Sync from SUSE:SLFO:Main mcelog revision aeb019e3941bfe1678e679fdfa2df45f

This commit is contained in:
Adrian Schröter 2024-05-03 16:43:37 +02:00
commit 466825ae5f
23 changed files with 3513 additions and 0 deletions

23
.gitattributes vendored Normal file
View File

@ -0,0 +1,23 @@
## Default LFS
*.7z filter=lfs diff=lfs merge=lfs -text
*.bsp filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.gem filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.jar filter=lfs diff=lfs merge=lfs -text
*.lz filter=lfs diff=lfs merge=lfs -text
*.lzma filter=lfs diff=lfs merge=lfs -text
*.obscpio filter=lfs diff=lfs merge=lfs -text
*.oxt filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.rpm filter=lfs diff=lfs merge=lfs -text
*.tbz filter=lfs diff=lfs merge=lfs -text
*.tbz2 filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.ttf filter=lfs diff=lfs merge=lfs -text
*.txz filter=lfs diff=lfs merge=lfs -text
*.whl filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text

78
README.email_setup Normal file
View File

@ -0,0 +1,78 @@
MACHINE CHECK EXCEPTION NOTIFICATION VIA EMAIL
==============================================
(C)opyright by Thomas Renninger <trenn@suse.de> Novell Inc. 2010
The setup to send Machine Check Exceptions (MCEs) via email relies on a
working smtp server listening on localhost on port 25.
How this can easily be configured can be read up here:
http://en.opensuse.org/Mail_server_HOWTO
in the "Outgoing" section.
Test your setup by trying to send test mails via the "mail" shell command,
included in the mailx package.
Specify the email address where the MCEs should get mailed to here:
/etc/sysconfig/mcelog
You can filter MCE mails by matching against these mail headers.
Either one of these headers is set:
- X-Mcelog-Uncorrectable
- X-Mcelog-Correctable
and one of these is set:
- X-Mcelog-Memory
- X-Mcelog-CPU
- X-Mcelog-Misc
NOTE: If broken HW results in an MCE storm of dozens or hundreds of MCEs,
mcelog will not send them all, in order not to overload the machine and the
network. If in doubt, check the local mcelog log files.
Autoyast
--------
For people making use of autoyast to spread similar installations on multiple
machines, here are some hints how to set up the email notification through
autoyast. Please read the autoyast documentation first if you are not familiar
with how to create an autoyast.xml file.
This simply sets the email address that notifications should get sent to:
<sysconfig config:type="list">
<sysconfig_entry>
<sysconfig_key>MCELOG_ADMIN_EMAIL</sysconfig_key>
<sysconfig_path>/etc/sysconfig/mcelog</sysconfig_path>
<sysconfig_value>trenn@suse.de</sysconfig_value>
</sysconfig_entry>
</sysconfig>
This is an example of how to set up postfix to listen on localhost and
send/forward all mails coming in there through the smtp server
relay.suse.de.
The alias at the beginning forwards local machine notifications sent to root
to trenn@suse.de. That way, mails of interest to the administrator can easily
be collected and sent to one email address. This is just one possible mail
setup example.
<mail>
<aliases config:type="list">
<alias>
<alias>root</alias>
<destinations>trenn@suse.de</destinations>
</alias>
</aliases>
<connection_type config:type="symbol">permanent</connection_type>
<listen_remote config:type="boolean">false</listen_remote>
<masquerade_other_domains config:type="list">
<domain>suse.de</domain>
</masquerade_other_domains>
<mta config:type="symbol">postfix</mta>
<outgoing_mail_server>relay.suse.de</outgoing_mail_server>
<postfix_mda config:type="symbol">local</postfix_mda>
<use_amavis config:type="boolean">false</use_amavis>
</mail>

View File

@ -0,0 +1,731 @@
From 4388981628ad9e2daba956210284017e1133cb99 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 7 May 2014 22:41:15 +0200
Subject: [PATCH] Start consolidating AMD-specific stuff
... in order to concentrate decoding for all families in amd.[ch]. Pass
down cpu type in decode_amd_mc.
Signed-off-by: Borislav Petkov <bp@suse.de>
---
Makefile | 2
amd.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
amd.h | 14 +++
k8.c | 281 --------------------------------------------------------------
k8.h | 11 --
mcelog.c | 8 -
6 files changed, 301 insertions(+), 297 deletions(-)
rename k8.c => amd.c (97%)
rename k8.h => amd.h (79%)
Index: mcelog-189/Makefile
===================================================================
--- mcelog-189.orig/Makefile
+++ mcelog-189/Makefile
@@ -31,7 +31,7 @@ all: mcelog
.PHONY: install install-nodoc clean depend FORCE
-OBJ := p4.o k8.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \
+OBJ := p4.o amd.o mcelog.o dmi.o tsc.o core2.o bitfield.o intel.o \
nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \
eventloop.o leaky-bucket.o memdb.o server.o trigger.o \
client.o cache.o sysfs.o yellow.o page.o rbtree.o \
Index: mcelog-189/amd.c
===================================================================
--- /dev/null
+++ mcelog-189/amd.c
@@ -0,0 +1,282 @@
+/* Based on K8 decoding code written for the 2.4 kernel by Andi Kleen and
+ * Eric Morton. Hacked and extended for mcelog by AK.
+ * Extended to support all AMD families by Borislav Petkov, SUSE Labs.
+ *
+ * Original copyright:
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Additional K8 decoding and simplification Copyright 2003 Eric Morton, Newisys Inc
+ * K8 threshold counters decoding Copyright 2005,2006 Jacob Shin, AMD Inc.
+ *
+ * Subject to the GNU General Public License
+ */
+
+#include <stdio.h>
+#include "mcelog.h"
+#include "amd.h"
+
+static char *k8bank[] = {
+ "data cache",
+ "instruction cache",
+ "bus unit",
+ "load/store unit",
+ "northbridge",
+ "fixed-issue reoder"
+};
+static char *transaction[] = {
+ "instruction", "data", "generic", "reserved"
+};
+static char *cachelevel[] = {
+ "0", "1", "2", "generic"
+};
+static char *memtrans[] = {
+ "generic error", "generic read", "generic write", "data read",
+ "data write", "instruction fetch", "prefetch", "evict", "snoop",
+ "?", "?", "?", "?", "?", "?", "?"
+};
+static char *partproc[] = {
+ "local node origin", "local node response",
+ "local node observed", "generic participation"
+};
+static char *timeout[] = {
+ "request didn't time out",
+ "request timed out"
+};
+static char *memoryio[] = {
+ "memory", "res.", "i/o", "generic"
+};
+static char *nbextendederr[] = {
+ "RAM ECC error",
+ "CRC error",
+ "Sync error",
+ "Master abort",
+ "Target abort",
+ "GART error",
+ "RMW error",
+ "Watchdog error",
+ "RAM Chipkill ECC error",
+ "DEV Error",
+ "Link Data Error",
+ "Link Protocol Error",
+ "NB Array Error",
+ "DRAM Parity Error",
+ "Link Retry",
+ "Tablew Walk Data Error",
+ "L3 Cache Data Error",
+ "L3 Cache Tag Error",
+ "L3 Cache LRU Error"
+};
+static char *highbits[32] = {
+ [31] = "valid",
+ [30] = "error overflow (multiple errors)",
+ [29] = "error uncorrected",
+ [28] = "error enable",
+ [27] = "misc error valid",
+ [26] = "error address valid",
+ [25] = "processor context corrupt",
+ [24] = "res24",
+ [23] = "res23",
+ /* 22-15 ecc syndrome bits */
+ [14] = "corrected ecc error",
+ [13] = "uncorrected ecc error",
+ [12] = "res12",
+ [11] = "L3 subcache in error bit 1",
+ [10] = "L3 subcache in error bit 0",
+ [9] = "sublink or DRAM channel",
+ [8] = "error found by scrub",
+ /* 7-4 ht link number of error */
+ [3] = "err cpu3",
+ [2] = "err cpu2",
+ [1] = "err cpu1",
+ [0] = "err cpu0",
+};
+static char *k8threshold[] = {
+ [0 ... K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknow threshold counter",
+ [K8_MCELOG_THRESHOLD_DRAM_ECC] = "MC4_MISC0 DRAM threshold",
+ [K8_MCELOG_THRESHOLD_LINK] = "MC4_MISC1 Link threshold",
+ [K8_MCELOG_THRESHOLD_L3_CACHE] = "MC4_MISC2 L3 Cache threshold",
+ [K8_MCELOG_THRESHOLD_FBDIMM] = "MC4_MISC3 FBDIMM threshold",
+ [K8_MCELOG_THRESHOLD_FBDIMM + 1 ...
+ K8_MCE_THRESHOLD_TOP - K8_MCE_THRESHOLD_BASE - 1] =
+ "Unknown threshold counter",
+};
+
+
+static void decode_k8_generic_errcode(u64 status)
+{
+ unsigned short errcode = status & 0xffff;
+ int i;
+
+ for (i=0; i<32; i++) {
+ if (i==31 || i==28 || i==26)
+ continue;
+ if (highbits[i] && (status & (1ULL<<(i+32)))) {
+ Wprintf( " bit%d = %s\n", i+32, highbits[i]);
+ }
+ }
+
+ if ((errcode & 0xFFF0) == 0x0010) {
+ Wprintf( " TLB error '%s transaction, level %s'\n",
+ transaction[(errcode >> 2) & 3],
+ cachelevel[errcode & 3]);
+ }
+ else if ((errcode & 0xFF00) == 0x0100) {
+ Wprintf( " memory/cache error '%s mem transaction, %s transaction, level %s'\n",
+ memtrans[(errcode >> 4) & 0xf],
+ transaction[(errcode >> 2) & 3],
+ cachelevel[errcode & 3]);
+ }
+ else if ((errcode & 0xF800) == 0x0800) {
+ Wprintf( " bus error '%s, %s\n %s mem transaction\n %s access, level %s'\n",
+ partproc[(errcode >> 9) & 0x3],
+ timeout[(errcode >> 8) & 1],
+ memtrans[(errcode >> 4) & 0xf],
+ memoryio[(errcode >> 2) & 0x3],
+ cachelevel[(errcode & 0x3)]);
+ }
+}
+
+static void decode_k8_dc_mc(u64 status, int *err)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+ unsigned short errcode = status & 0xffff;
+
+ if(status&(3ULL<<45)) {
+ Wprintf( " Data cache ECC error (syndrome %x)",
+ (u32) (status >> 47) & 0xff);
+ if(status&(1ULL<<40)) {
+ Wprintf(" found by scrubber");
+ }
+ Wprintf("\n");
+ }
+
+ if ((errcode & 0xFFF0) == 0x0010) {
+ Wprintf( " TLB parity error in %s array\n",
+ (exterrcode == 0) ? "physical" : "virtual");
+ }
+
+ decode_k8_generic_errcode(status);
+}
+
+static void decode_k8_ic_mc(u64 status, int *err)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+ unsigned short errcode = status & 0xffff;
+
+ if(status&(3ULL<<45)) {
+ Wprintf(" Instruction cache ECC error\n");
+ }
+
+ if ((errcode & 0xFFF0) == 0x0010) {
+ Wprintf(" TLB parity error in %s array\n",
+ (exterrcode == 0) ? "physical" : "virtual");
+ }
+
+ decode_k8_generic_errcode(status);
+}
+
+static void decode_k8_bu_mc(u64 status, int *err)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+
+ if(status&(3ULL<<45)) {
+ Wprintf(" L2 cache ECC error\n");
+ }
+
+ Wprintf(" %s array error\n",
+ (exterrcode == 0) ? "Bus or cache" : "Cache tag");
+
+ decode_k8_generic_errcode(status);
+}
+
+static void decode_k8_ls_mc(u64 status, int *err)
+{
+ decode_k8_generic_errcode(status);
+}
+
+static void decode_k8_nb_mc(u64 status, int *memerr)
+{
+ unsigned short exterrcode = (status >> 16) & 0x0f;
+
+ Wprintf(" Northbridge %s\n", nbextendederr[exterrcode]);
+
+ switch (exterrcode) {
+ case 0:
+ *memerr = 1;
+ Wprintf(" ECC syndrome = %x\n",
+ (u32) (status >> 47) & 0xff);
+ break;
+ case 8:
+ *memerr = 1;
+ Wprintf(" Chipkill ECC syndrome = %x\n",
+ (u32) ((((status >> 24) & 0xff) << 8) | ((status >> 47) & 0xff)));
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 6:
+ Wprintf(" link number = %x\n",
+ (u32) (status >> 36) & 0xf);
+ break;
+ }
+
+ decode_k8_generic_errcode(status);
+}
+
+static void decode_k8_fr_mc(u64 status, int *err)
+{
+ decode_k8_generic_errcode(status);
+}
+
+static void decode_k8_threshold(u64 misc)
+{
+ if (misc & MCI_THRESHOLD_OVER)
+ Wprintf(" Threshold error count overflow\n");
+}
+
+typedef void (*decoder_t)(u64, int *ismemerr);
+
+static decoder_t decoders[] = {
+ [0] = decode_k8_dc_mc,
+ [1] = decode_k8_ic_mc,
+ [2] = decode_k8_bu_mc,
+ [3] = decode_k8_ls_mc,
+ [4] = decode_k8_nb_mc,
+ [5] = decode_k8_fr_mc,
+};
+
+void decode_amd_mc(enum cputype cpu, struct mce *mce, int *ismemerr)
+{
+ if (mce->bank < NELE(decoders))
+ decoders[mce->bank](mce->status, ismemerr);
+ else if (mce->bank >= K8_MCE_THRESHOLD_BASE &&
+ mce->bank < K8_MCE_THRESHOLD_TOP)
+ decode_k8_threshold(mce->misc);
+ else
+ Wprintf(" no decoder for unknown bank %u\n", mce->bank);
+}
+
+char *k8_bank_name(unsigned num)
+{
+ static char buf[64];
+ char *s = "unknown";
+ if (num < NELE(k8bank))
+ s = k8bank[num];
+ else if (num >= K8_MCE_THRESHOLD_BASE &&
+ num < K8_MCE_THRESHOLD_TOP)
+ s = k8threshold[num - K8_MCE_THRESHOLD_BASE];
+ buf[sizeof(buf)-1] = 0;
+ snprintf(buf, sizeof(buf) - 1, "%u %s", num, s);
+ return buf;
+}
+
+int mce_filter_k8(struct mce *m)
+{
+ /* Filter out GART errors */
+ if (m->bank == 4) {
+ unsigned short exterrcode = (m->status >> 16) & 0x0f;
+ if (exterrcode == 5 && (m->status & (1ULL<<61)))
+ return 0;
+ }
+ return 1;
+}
Index: mcelog-189/amd.h
===================================================================
--- /dev/null
+++ mcelog-189/amd.h
@@ -0,0 +1,80 @@
+char *k8_bank_name(unsigned num);
+void decode_amd_mc(enum cputype, struct mce *mce, int *ismemerr);
+int mce_filter_k8(struct mce *m);
+
+#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
+#define K8_MCE_THRESHOLD_TOP (K8_MCE_THRESHOLD_BASE + 6 * 9)
+
+#define K8_MCELOG_THRESHOLD_DRAM_ECC (4 * 9 + 0)
+#define K8_MCELOG_THRESHOLD_LINK (4 * 9 + 1)
+#define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2)
+#define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3)
+
+#define EC(x) ((x) & 0xffff)
+#define XEC(x, mask) (((x) >> 16) & mask)
+
+#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
+#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
+
+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400)
+
+#define TT(x) (((x) >> 2) & 0x3)
+#define TT_MSG(x) tt_msgs[TT(x)]
+#define II(x) (((x) >> 2) & 0x3)
+#define II_MSG(x) ii_msgs[II(x)]
+#define LL(x) ((x) & 0x3)
+#define LL_MSG(x) ll_msgs[LL(x)]
+#define TO(x) (((x) >> 8) & 0x1)
+#define TO_MSG(x) to_msgs[TO(x)]
+#define PP(x) (((x) >> 9) & 0x3)
+#define PP_MSG(x) pp_msgs[PP(x)]
+#define UU(x) (((x) >> 8) & 0x3)
+#define UU_MSG(x) uu_msgs[UU(x)]
+
+#define R4(x) (((x) >> 4) & 0xf)
+#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!")
+
+enum tt_ids {
+ TT_INSTR = 0,
+ TT_DATA,
+ TT_GEN,
+ TT_RESV,
+};
+
+enum ll_ids {
+ LL_RESV = 0,
+ LL_L1,
+ LL_L2,
+ LL_LG,
+};
+
+enum ii_ids {
+ II_MEM = 0,
+ II_RESV,
+ II_IO,
+ II_GEN,
+};
+
+enum rrrr_ids {
+ R4_GEN = 0,
+ R4_RD,
+ R4_WR,
+ R4_DRD,
+ R4_DWR,
+ R4_IRD,
+ R4_PREF,
+ R4_EVICT,
+ R4_SNOOP,
+};
+
+#define CASE_AMD_CPUS \
+ (cputype == CPU_K8 || \
+ cputype == CPU_F10H || \
+ cputype == CPU_F11H || \
+ cputype == CPU_F12H || \
+ cputype == CPU_F14H || \
+ cputype == CPU_F15H || \
+ cputype == CPU_F16H)
Index: mcelog-189/k8.c
===================================================================
--- mcelog-189.orig/k8.c
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Based on K8 decoding code written for the 2.4 kernel by Andi Kleen and
- * Eric Morton. Hacked and extended for mcelog by AK.
- *
- * Original copyright:
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Additional K8 decoding and simplification Copyright 2003 Eric Morton, Newisys Inc
- * K8 threshold counters decoding Copyright 2005,2006 Jacob Shin, AMD Inc.
- *
- * Subject to the GNU General Public License
- */
-
-#include <stdio.h>
-#include "mcelog.h"
-#include "k8.h"
-
-static char *k8bank[] = {
- "data cache",
- "instruction cache",
- "bus unit",
- "load/store unit",
- "northbridge",
- "fixed-issue reoder"
-};
-static char *transaction[] = {
- "instruction", "data", "generic", "reserved"
-};
-static char *cachelevel[] = {
- "0", "1", "2", "generic"
-};
-static char *memtrans[] = {
- "generic error", "generic read", "generic write", "data read",
- "data write", "instruction fetch", "prefetch", "evict", "snoop",
- "?", "?", "?", "?", "?", "?", "?"
-};
-static char *partproc[] = {
- "local node origin", "local node response",
- "local node observed", "generic participation"
-};
-static char *timeout[] = {
- "request didn't time out",
- "request timed out"
-};
-static char *memoryio[] = {
- "memory", "res.", "i/o", "generic"
-};
-static char *nbextendederr[] = {
- "RAM ECC error",
- "CRC error",
- "Sync error",
- "Master abort",
- "Target abort",
- "GART error",
- "RMW error",
- "Watchdog error",
- "RAM Chipkill ECC error",
- "DEV Error",
- "Link Data Error",
- "Link Protocol Error",
- "NB Array Error",
- "DRAM Parity Error",
- "Link Retry",
- "Tablew Walk Data Error",
- "L3 Cache Data Error",
- "L3 Cache Tag Error",
- "L3 Cache LRU Error"
-};
-static char *highbits[32] = {
- [31] = "valid",
- [30] = "error overflow (multiple errors)",
- [29] = "error uncorrected",
- [28] = "error enable",
- [27] = "misc error valid",
- [26] = "error address valid",
- [25] = "processor context corrupt",
- [24] = "res24",
- [23] = "res23",
- /* 22-15 ecc syndrome bits */
- [14] = "corrected ecc error",
- [13] = "uncorrected ecc error",
- [12] = "res12",
- [11] = "L3 subcache in error bit 1",
- [10] = "L3 subcache in error bit 0",
- [9] = "sublink or DRAM channel",
- [8] = "error found by scrub",
- /* 7-4 ht link number of error */
- [3] = "err cpu3",
- [2] = "err cpu2",
- [1] = "err cpu1",
- [0] = "err cpu0",
-};
-static char *k8threshold[] = {
- [0 ... K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknown threshold counter",
- [K8_MCELOG_THRESHOLD_DRAM_ECC] = "MC4_MISC0 DRAM threshold",
- [K8_MCELOG_THRESHOLD_LINK] = "MC4_MISC1 Link threshold",
- [K8_MCELOG_THRESHOLD_L3_CACHE] = "MC4_MISC2 L3 Cache threshold",
- [K8_MCELOG_THRESHOLD_FBDIMM] = "MC4_MISC3 FBDIMM threshold",
- [K8_MCELOG_THRESHOLD_FBDIMM + 1 ...
- K8_MCE_THRESHOLD_TOP - K8_MCE_THRESHOLD_BASE - 1] =
- "Unknown threshold counter",
-};
-
-
-static void decode_k8_generic_errcode(u64 status)
-{
- unsigned short errcode = status & 0xffff;
- int i;
-
- for (i=0; i<32; i++) {
- if (i==31 || i==28 || i==26)
- continue;
- if (highbits[i] && (status & (1ULL<<(i+32)))) {
- Wprintf( " bit%d = %s\n", i+32, highbits[i]);
- }
- }
-
- if ((errcode & 0xFFF0) == 0x0010) {
- Wprintf( " TLB error '%s transaction, level %s'\n",
- transaction[(errcode >> 2) & 3],
- cachelevel[errcode & 3]);
- }
- else if ((errcode & 0xFF00) == 0x0100) {
- Wprintf( " memory/cache error '%s mem transaction, %s transaction, level %s'\n",
- memtrans[(errcode >> 4) & 0xf],
- transaction[(errcode >> 2) & 3],
- cachelevel[errcode & 3]);
- }
- else if ((errcode & 0xF800) == 0x0800) {
- Wprintf( " bus error '%s, %s\n %s mem transaction\n %s access, level %s'\n",
- partproc[(errcode >> 9) & 0x3],
- timeout[(errcode >> 8) & 1],
- memtrans[(errcode >> 4) & 0xf],
- memoryio[(errcode >> 2) & 0x3],
- cachelevel[(errcode & 0x3)]);
- }
-}
-
-static void decode_k8_dc_mc(u64 status, int *err)
-{
- unsigned short exterrcode = (status >> 16) & 0x0f;
- unsigned short errcode = status & 0xffff;
-
- if(status&(3ULL<<45)) {
- Wprintf( " Data cache ECC error (syndrome %x)",
- (u32) (status >> 47) & 0xff);
- if(status&(1ULL<<40)) {
- Wprintf(" found by scrubber");
- }
- Wprintf("\n");
- }
-
- if ((errcode & 0xFFF0) == 0x0010) {
- Wprintf( " TLB parity error in %s array\n",
- (exterrcode == 0) ? "physical" : "virtual");
- }
-
- decode_k8_generic_errcode(status);
-}
-
-static void decode_k8_ic_mc(u64 status, int *err)
-{
- unsigned short exterrcode = (status >> 16) & 0x0f;
- unsigned short errcode = status & 0xffff;
-
- if(status&(3ULL<<45)) {
- Wprintf(" Instruction cache ECC error\n");
- }
-
- if ((errcode & 0xFFF0) == 0x0010) {
- Wprintf(" TLB parity error in %s array\n",
- (exterrcode == 0) ? "physical" : "virtual");
- }
-
- decode_k8_generic_errcode(status);
-}
-
-static void decode_k8_bu_mc(u64 status, int *err)
-{
- unsigned short exterrcode = (status >> 16) & 0x0f;
-
- if(status&(3ULL<<45)) {
- Wprintf(" L2 cache ECC error\n");
- }
-
- Wprintf(" %s array error\n",
- (exterrcode == 0) ? "Bus or cache" : "Cache tag");
-
- decode_k8_generic_errcode(status);
-}
-
-static void decode_k8_ls_mc(u64 status, int *err)
-{
- decode_k8_generic_errcode(status);
-}
-
-static void decode_k8_nb_mc(u64 status, int *memerr)
-{
- unsigned short exterrcode = (status >> 16) & 0x0f;
-
- Wprintf(" Northbridge %s\n", nbextendederr[exterrcode]);
-
- switch (exterrcode) {
- case 0:
- *memerr = 1;
- Wprintf(" ECC syndrome = %x\n",
- (u32) (status >> 47) & 0xff);
- break;
- case 8:
- *memerr = 1;
- Wprintf(" Chipkill ECC syndrome = %x\n",
- (u32) ((((status >> 24) & 0xff) << 8) | ((status >> 47) & 0xff)));
- break;
- case 1:
- case 2:
- case 3:
- case 4:
- case 6:
- Wprintf(" link number = %x\n",
- (u32) (status >> 36) & 0xf);
- break;
- }
-
- decode_k8_generic_errcode(status);
-}
-
-static void decode_k8_fr_mc(u64 status, int *err)
-{
- decode_k8_generic_errcode(status);
-}
-
-static void decode_k8_threshold(u64 misc)
-{
- if (misc & MCI_THRESHOLD_OVER)
- Wprintf(" Threshold error count overflow\n");
-}
-
-typedef void (*decoder_t)(u64, int *ismemerr);
-
-static decoder_t decoders[] = {
- [0] = decode_k8_dc_mc,
- [1] = decode_k8_ic_mc,
- [2] = decode_k8_bu_mc,
- [3] = decode_k8_ls_mc,
- [4] = decode_k8_nb_mc,
- [5] = decode_k8_fr_mc,
-};
-
-void decode_k8_mc(struct mce *mce, int *ismemerr)
-{
- if (mce->bank < NELE(decoders))
- decoders[mce->bank](mce->status, ismemerr);
- else if (mce->bank >= K8_MCE_THRESHOLD_BASE &&
- mce->bank < K8_MCE_THRESHOLD_TOP)
- decode_k8_threshold(mce->misc);
- else
- Wprintf(" no decoder for unknown bank %u\n", mce->bank);
-}
-
-char *k8_bank_name(unsigned num)
-{
- static char buf[64];
- char *s = "unknown";
- if (num < NELE(k8bank))
- s = k8bank[num];
- else if (num >= K8_MCE_THRESHOLD_BASE &&
- num < K8_MCE_THRESHOLD_TOP)
- s = k8threshold[num - K8_MCE_THRESHOLD_BASE];
- buf[sizeof(buf)-1] = 0;
- snprintf(buf, sizeof(buf) - 1, "%u %s", num, s);
- return buf;
-}
-
-int mce_filter_k8(struct mce *m)
-{
- /* Filter out GART errors */
- if (m->bank == 4) {
- unsigned short exterrcode = (m->status >> 16) & 0x0f;
- if (exterrcode == 5 && (m->status & (1ULL<<61)))
- return 0;
- }
- return 1;
-}
Index: mcelog-189/k8.h
===================================================================
--- mcelog-189.orig/k8.h
+++ /dev/null
@@ -1,11 +0,0 @@
-char *k8_bank_name(unsigned num);
-void decode_k8_mc(struct mce *mce, int *ismemerr);
-int mce_filter_k8(struct mce *m);
-
-#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
-#define K8_MCE_THRESHOLD_TOP (K8_MCE_THRESHOLD_BASE + 6 * 9)
-
-#define K8_MCELOG_THRESHOLD_DRAM_ECC (4 * 9 + 0)
-#define K8_MCELOG_THRESHOLD_LINK (4 * 9 + 1)
-#define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2)
-#define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3)
Index: mcelog-189/mcelog.c
===================================================================
--- mcelog-189.orig/mcelog.c
+++ mcelog-189/mcelog.c
@@ -41,7 +41,7 @@
#include <fnmatch.h>
#include "mcelog.h"
#include "paths.h"
-#include "k8.h"
+#include "amd.h"
#include "intel.h"
#include "p4.h"
#include "dmi.h"
@@ -346,8 +346,8 @@ static void dump_mce(struct mce *m, unsi
time_t t = m->time;
Wprintf("TIME %llu %s", m->time, ctime(&t));
}
- if (cputype == CPU_K8)
- decode_k8_mc(m, &ismemerr);
+ if CASE_AMD_CPUS
+ decode_amd_mc(m, &ismemerr);
else if (cputype >= CPU_INTEL)
decode_intel_mc(m, cputype, &ismemerr, recordlen);
/* else add handlers for other CPUs here */

15
_service Normal file
View File

@ -0,0 +1,15 @@
<services>
<service name="obs_scm" mode="localonly">
<param name="scm">git</param>
<param name="url">https://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git</param>
<param name="changesgenerate">enable</param>
<param name="versionrewrite-pattern">v(.*)</param>
<param name="versionformat">@PARENT_TAG@</param>
</service>
<service name="set_version" mode="localonly"/>
<service name="tar" mode="buildtime"/>
<service name="recompress" mode="buildtime">
<param name="file">*.tar</param>
<param name="compression">gz</param>
</service>
</services>

10
_servicedata Normal file
View File

@ -0,0 +1,10 @@
<servicedata>
<service name="tar_scm">
<param name="url">https://github.com/andikleen/mcelog</param>
<param name="changesrevision">ee90ff20ce6a4d5e016aa249ce8b37f359f9fda4</param></service><service name="tar_scm">
<param name="url">git://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git</param>
<param name="changesrevision">04d51981e8805c4200f5a03b4216c8621bc52ace</param></service><service name="tar_scm">
<param name="url">https://github.com/andikleen/mcelog.git</param>
<param name="changesrevision">1f3a769c8fb736815a56ea104b7b751c5565cb88</param></service><service name="tar_scm">
<param name="url">https://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git</param>
<param name="changesrevision">edfe78a0dc54a940f4916a9bd681eab7b3f746d1</param></service></servicedata>

683
add-f10h-support.patch Normal file
View File

@ -0,0 +1,683 @@
Add F10h decoding support
Signed-off-by: Borislav Petkov <bp@suse.de>
---
amd.c | 488 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
amd.h | 42 ++++-
mcelog.c | 26 +--
mcelog.h | 1
4 files changed, 506 insertions(+), 51 deletions(-)
Index: mcelog-189/amd.c
===================================================================
--- mcelog-189.orig/amd.c
+++ mcelog-189/amd.c
@@ -14,7 +14,7 @@
#include "mcelog.h"
#include "amd.h"
-static char *k8bank[] = {
+static const char * const k8bank[] = {
"data cache",
"instruction cache",
"bus unit",
@@ -22,28 +22,34 @@ static char *k8bank[] = {
"northbridge",
"fixed-issue reoder"
};
-static char *transaction[] = {
+static const char * const transaction[] = {
"instruction", "data", "generic", "reserved"
-};
-static char *cachelevel[] = {
+};
+static const char * const cachelevel[] = {
"0", "1", "2", "generic"
};
-static char *memtrans[] = {
+static const char * const memtrans[] = {
"generic error", "generic read", "generic write", "data read",
"data write", "instruction fetch", "prefetch", "evict", "snoop",
"?", "?", "?", "?", "?", "?", "?"
};
-static char *partproc[] = {
- "local node origin", "local node response",
- "local node observed", "generic participation"
+static const char * const partproc[] = {
+ "local node origin",
+ "local node response",
+ "local node observed",
+ "generic participation"
};
-static char *timeout[] = {
+static const char * const timeout[] = {
"request didn't time out",
"request timed out"
};
-static char *memoryio[] = {
+static const char * const memoryio[] = {
"memory", "res.", "i/o", "generic"
};
+
+/* internal error type */
+static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
+
static char *nbextendederr[] = {
"RAM ECC error",
"CRC error",
@@ -65,6 +71,46 @@ static char *nbextendederr[] = {
"L3 Cache Tag Error",
"L3 Cache LRU Error"
};
+
+static const char * const mc4_mce_desc[] = {
+ "DRAM ECC error detected on the NB",
+ "CRC error detected on HT link",
+ "Link-defined sync error packets detected on HT link",
+ "HT Master abort",
+ "HT Target abort",
+ "Invalid GART PTE entry during GART table walk",
+ "Unsupported atomic RMW received from an IO link",
+ "Watchdog timeout due to lack of progress",
+ "DRAM ECC error detected on the NB",
+ "SVM DMA Exclusion Vector error",
+ "HT data error detected on link",
+ "Protocol error (link, L3, probe filter)",
+ "NB internal arrays parity error",
+ "DRAM addr/ctl signals parity error",
+ "IO link transmission error",
+ "L3 data cache ECC error", /* xec = 0x1c */
+ "L3 cache tag error",
+ "L3 LRU parity bits error",
+ "ECC Error in the Probe Filter directory"
+};
+
+static const char * const mc5_mce_desc[] = {
+ "CPU Watchdog timer expire",
+ "Wakeup array dest tag",
+ "AG payload array",
+ "EX payload array",
+ "IDRF array",
+ "Retire dispatch queue",
+ "Mapper checkpoint array",
+ "Physical register file EX0 port",
+ "Physical register file EX1 port",
+ "Physical register file AG0 port",
+ "Physical register file AG1 port",
+ "Flag register file",
+ "DE error occurred",
+ "Retire status queue"
+};
+
static char *highbits[32] = {
[31] = "valid",
[30] = "error overflow (multiple errors)",
@@ -100,6 +146,21 @@ static char *k8threshold[] = {
"Unknown threshold counter",
};
+static u8 xec_mask = 0xf;
+
+enum cputype select_amd_cputype(u32 family)
+{
+ switch (family) {
+ case 0xf:
+ return CPU_K8;
+ case 0x10:
+ return CPU_F10H;
+ default:
+ break;
+ }
+
+ return CPU_GENERIC;
+}
static void decode_k8_generic_errcode(u64 status)
{
@@ -245,21 +306,393 @@ static decoder_t decoders[] = {
[5] = decode_k8_fr_mc,
};
-void decode_amd_mc(enum cputype cpu, struct mce *mce, int *ismemerr)
+/*
+ * Print the MC1 description of a memory-hierarchy error, selected by the
+ * cache level (LL) and, for level 0x1, by the request type (R4).
+ * Returns false without printing when the error code is not a memory error
+ * or the LL/R4 combination is not recognised.  xec is not used here.
+ */
+static bool k8_mc1_mce(u16 ec, u8 xec)
+{
+ u8 ll = LL(ec);
+ bool ret = true;
+
+ if (!MEM_ERROR(ec))
+ return false;
+
+ if (ll == 0x2)
+ Wprintf("during a linefill from L2.\n");
+ else if (ll == 0x1) {
+ switch (R4(ec)) {
+ case R4_IRD:
+ Wprintf("Parity error during data load.\n");
+ break;
+
+ case R4_EVICT:
+ Wprintf("Copyback Parity/Victim error.\n");
+ break;
+
+ case R4_SNOOP:
+ Wprintf("Tag Snoop error.\n");
+ break;
+
+ default:
+ ret = false;
+ break;
+ }
+ } else
+ ret = false;
+
+ return ret;
+}
+
+/*
+ * Decode an MC0 memory-hierarchy error by cache level: an L1 linefill from
+ * L2, or a data/tag error at L1 qualified by the request type.
+ * Returns true when a description was printed, false when the error code is
+ * not a memory error or names another cache level.  xec is not used here.
+ */
+static bool f12h_mc0_mce(u16 ec, u8 xec)
+{
+	bool ret = false;
+
+	if (MEM_ERROR(ec)) {
+		u8 ll = LL(ec);
+		ret = true;
+
+		if (ll == LL_L2)
+			Wprintf("during L1 linefill from L2.\n");
+		else if (ll == LL_L1)
+			Wprintf("Data/Tag %s error.\n", R4_MSG(ec));
+		else
+			ret = false;
+	}
+	return ret;
+}
+
+/*
+ * MC0 decoder for family 10h: reports a generic request at L1 as a data
+ * scrub error and hands every other error code to f12h_mc0_mce().
+ */
+static bool f10h_mc0_mce(u16 ec, u8 xec)
+{
+ if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
+ Wprintf("during data scrub.\n");
+ return true;
+ }
+ return f12h_mc0_mce(ec, xec);
+}
+
+/*
+ * Decode a bank 0 machine check and print one line describing it.
+ * Data TLB errors are decoded here for all families; everything that is not
+ * a TLB error goes to the family specific ops->mc0_mce() callback.  Any
+ * signature that nobody recognises - including a TLB error that is not a
+ * data TLB error - is reported as corrupted, so the " MC0 Error: " prefix is
+ * never left without a terminating line.
+ */
+static void decode_mc0_mce(struct amd_decoder_ops *ops, struct mce *m)
+{
+	u16 ec = EC(m->status);
+	u8 xec = XEC(m->status, xec_mask);
+
+	Wprintf(" MC0 Error: ");
+
+	/* TLB error signatures are the same across families */
+	if (TLB_ERROR(ec)) {
+		if (TT(ec) == TT_DATA) {
+			Wprintf("%s TLB %s.\n", LL_MSG(ec),
+				((xec == 2) ? "locked miss"
+				 : (xec ? "multimatch" : "parity")));
+			return;
+		}
+	} else if (ops->mc0_mce(ec, xec)) {
+		return;
+	}
+
+	Eprintf("Corrupted MC0 MCE info?\n");
+}
+
+static void decode_mc1_mce(struct amd_decoder_ops *ops, struct mce *m)
{
- if (mce->bank < NELE(decoders))
- decoders[mce->bank](mce->status, ismemerr);
- else if (mce->bank >= K8_MCE_THRESHOLD_BASE &&
- mce->bank < K8_MCE_THRESHOLD_TOP)
- decode_k8_threshold(mce->misc);
+ u16 ec = EC(m->status);
+ u8 xec = XEC(m->status, xec_mask);
+
+ Wprintf(" MC1 Error: ");
+
+ if (TLB_ERROR(ec))
+ Wprintf("%s TLB %s.\n", LL_MSG(ec),
+ (xec ? "multimatch" : "parity error"));
+ else if (BUS_ERROR(ec)) {
+ bool k8 = ((ops->cpu == AMD_K8) && (m->status & BIT_64(58)));
+
+ Wprintf("during %s.\n", (k8 ? "system linefill" : "NB data read"));
+ } else if (ops->mc1_mce(ec, xec))
+ ;
else
- Wprintf(" no decoder for unknown bank %u\n", mce->bank);
+ Eprintf("Corrupted MC1 MCE info?\n");
+}
+
+/*
+ * Print the remainder of an MC2 error line, selected first by the extended
+ * error code (0x0..0x3) and, for xec 0x0, by the error class and request type
+ * of ec.  Returns false without printing when the combination is not
+ * recognised.
+ */
+static bool k8_mc2_mce(u16 ec, u8 xec)
+{
+ bool ret = true;
+
+ if (xec == 0x1)
+ Wprintf(" in the write data buffers.\n");
+ else if (xec == 0x3)
+ Wprintf(" in the victim data buffers.\n");
+ else if (xec == 0x2 && MEM_ERROR(ec))
+ Wprintf(": %s error in the L2 cache tags.\n", R4_MSG(ec));
+ else if (xec == 0x0) {
+ if (TLB_ERROR(ec))
+ Wprintf(": %s error in a Page Descriptor Cache or "
+ "Guest TLB.\n", TT_MSG(ec));
+ else if (BUS_ERROR(ec))
+ Wprintf(": %s/ECC error in data read from NB: %s.\n",
+ R4_MSG(ec), PP_MSG(ec));
+ else if (MEM_ERROR(ec)) {
+ u8 r4 = R4(ec);
+
+ if (r4 >= 0x7)
+ Wprintf(": %s error during data copyback.\n",
+ R4_MSG(ec));
+ else if (r4 <= 0x1)
+ Wprintf(": %s parity/ECC error during data "
+ "access from L2.\n", R4_MSG(ec));
+ else
+ ret = false;
+ } else
+ ret = false;
+ } else
+ ret = false;
+
+ return ret;
+}
+
+/*
+ * Decode a bank 2 machine check: print the " MC2 Error: " prefix and let the
+ * family specific ops->mc2_mce() callback print the rest; report the record
+ * as corrupted when the callback does not recognise it.
+ */
+static void decode_mc2_mce(struct amd_decoder_ops *ops, struct mce *m)
+{
+ u16 ec = EC(m->status);
+ u8 xec = XEC(m->status, xec_mask);
+
+ Wprintf(" MC2 Error: ");
+
+ if (!ops->mc2_mce(ec, xec))
+ Eprintf("Corrupted MC2 MCE info?\n");
+}
+
+/*
+ * Decode a bank 3 machine check.  Only xec 0x0 with a bus error whose request
+ * type is a data read or data write is accepted; anything else is reported as
+ * corrupted.  CPUs at or after AMD_F14H in enum amdcpu are rejected up front.
+ */
+static void decode_mc3_mce(struct amd_decoder_ops *ops, struct mce *m)
+{
+ u16 ec = EC(m->status);
+ u8 xec = XEC(m->status, xec_mask);
+
+ if (ops->cpu >= AMD_F14H) {
+ Eprintf("You shouldn't be seeing MC3 MCE on this cpu family,"
+ " please report on LKML.\n");
+ return;
+ }
+
+ Wprintf(" MC3 Error");
+
+ if (xec == 0x0) {
+ u8 r4 = R4(ec);
+
+ if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
+ goto wrong_mc3_mce;
+
+ Wprintf(" during %s.\n", R4_MSG(ec));
+ } else
+ goto wrong_mc3_mce;
+
+ return;
+
+wrong_mc3_mce:
+ Eprintf("Corrupted MC3 MCE info?\n");
+}
+
+/*
+ * Decode a bank 4 (northbridge) machine check and print one line for it.
+ * The extended error code is always taken with a 5 bit mask here, independent
+ * of xec_mask.  Codes 0x0..0xe and 0x1c..0x1f are looked up in mc4_mce_desc
+ * (the latter with an offset of 13), 0xf is split by error class, and 0x19 is
+ * only valid on families 15h and 16h.  Anything else is reported as corrupted.
+ */
+static void decode_mc4_mce(struct amd_decoder_ops *ops, struct mce *m)
+{
+	u16 ec = EC(m->status);
+	u8 xec = XEC(m->status, 0x1f);
+	u8 offset = 0;
+
+	Wprintf(" MC4 Error: ");
+
+	switch (xec) {
+	case 0x0 ... 0xe:
+
+		/* special handling for DRAM ECCs */
+		if (xec == 0x0 || xec == 0x8) {
+			/* no ECCs on F11h */
+			if (ops->cpu == AMD_F11H)
+				goto wrong_mc4_mce;
+
+			Wprintf("%s.\n", mc4_mce_desc[xec]);
+			return;
+		}
+		break;
+
+	case 0xf:
+		if (TLB_ERROR(ec))
+			Wprintf("GART Table Walk data error.\n");
+		else if (BUS_ERROR(ec))
+			Wprintf("DMA Exclusion Vector Table Walk error.\n");
+		else
+			goto wrong_mc4_mce;
+		return;
+
+	case 0x19:
+		/* an "||" of two range checks would accept every family */
+		if (ops->cpu == AMD_F15H || ops->cpu == AMD_F16H)
+			Wprintf("Compute Unit Data Error.\n");
+		else
+			goto wrong_mc4_mce;
+		return;
+
+	case 0x1c ... 0x1f:
+		/* these entries follow xec 0xe in mc4_mce_desc */
+		offset = 13;
+		break;
+
+	default:
+		goto wrong_mc4_mce;
+	}
+
+	Wprintf("%s.\n", mc4_mce_desc[xec - offset]);
+	return;
+
+ wrong_mc4_mce:
+	Eprintf("Corrupted MC4 MCE info?\n");
+}
+
+/*
+ * Decode a bank 5 machine check using mc5_mce_desc.  Extended error codes
+ * 0x0 and 0xc are printed as they are, the other codes up to 0xd get
+ * " parity error" appended, and larger codes are reported as corrupted.
+ * AMD_K8 and AMD_F11H are rejected before anything is printed.
+ */
+static void decode_mc5_mce(struct amd_decoder_ops *ops, struct mce *m)
+{
+ u8 xec = XEC(m->status, xec_mask);
+
+ if (ops->cpu == AMD_K8 || ops->cpu == AMD_F11H)
+ goto wrong_mc5_mce;
+
+ Wprintf(" MC5 Error: ");
+
+ if (xec == 0x0 || xec == 0xc)
+ Wprintf("%s.\n", mc5_mce_desc[xec]);
+ else if (xec <= 0xd)
+ Wprintf("%s parity error.\n", mc5_mce_desc[xec]);
+ else
+ goto wrong_mc5_mce;
+
+ return;
+
+ wrong_mc5_mce:
+ Eprintf("Corrupted MC5 MCE info?\n");
+}
+
+/*
+ * Decode a bank 6 machine check: extended error codes 0x1..0x5 name the
+ * structure that took a parity error; every other code is reported as
+ * corrupted.
+ */
+static void decode_mc6_mce(struct mce *m)
+{
+ u8 xec = XEC(m->status, xec_mask);
+
+ Wprintf(" MC6 Error: ");
+
+ switch (xec) {
+ case 0x1:
+ Wprintf("Free List");
+ break;
+
+ case 0x2:
+ Wprintf("Physical Register File");
+ break;
+
+ case 0x3:
+ Wprintf("Retire Queue");
+ break;
+
+ case 0x4:
+ Wprintf("Scheduler table");
+ break;
+
+ case 0x5:
+ Wprintf("Status Register File");
+ break;
+
+ default:
+ goto wrong_mc6_mce;
+ break;
+ }
+
+ Wprintf(" parity error.\n");
+
+ return;
+
+ wrong_mc6_mce:
+ Eprintf("Corrupted MC6 MCE info?\n");
+}
+
+/*
+ * Print the generic fields of a 16 bit error code on one line.
+ * Internal errors print only their UU type.  Otherwise the cache level is
+ * printed, followed by the memory/io field for bus errors or the transaction
+ * type for everything else, then the request type for memory and bus errors,
+ * and finally the participation and timeout fields for bus errors.
+ */
+static inline void amd_decode_err_code(u16 ec)
+{
+ if (INT_ERROR(ec)) {
+ Wprintf(" internal: %s\n", UU_MSG(ec));
+ return;
+ }
+
+ Wprintf(" cache level: %s", LL_MSG(ec));
+
+ if (BUS_ERROR(ec))
+ Wprintf(", mem/io: %s", II_MSG(ec));
+ else
+ Wprintf(", tx: %s", TT_MSG(ec));
+
+ if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
+ Wprintf(", mem-tx: %s", R4_MSG(ec));
+
+ if (BUS_ERROR(ec))
+ Wprintf(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
+ }
+
+ Wprintf("\n");
+}
+
+/*
+ * Per-family decoder callbacks, indexed by enum amdcpu and selected in
+ * __decode_amd_mc().  Slots without an initializer are zero filled, so they
+ * must not be selected.
+ */
+struct amd_decoder_ops fam_ops[] = {
+ [AMD_F10H] = {
+ .cpu = AMD_F10H,
+ .mc0_mce = f10h_mc0_mce,
+ .mc1_mce = k8_mc1_mce,
+ .mc2_mce = k8_mc2_mce,
+ },
+};
+
+/*
+ * Decode one machine check for the families handled through fam_ops.
+ * The cputype selects the callback set; an unknown cputype is reported and
+ * nothing is decoded.  Banks 0..6 go to their decode_mcN_mce() routine, other
+ * banks get no bank specific line.  In both cases the low 16 bits of the
+ * status are then printed by amd_decode_err_code().
+ */
+static void __decode_amd_mc(enum cputype cpu, struct mce *mce)
+{
+ struct amd_decoder_ops *ops;
+
+ switch (cpu) {
+ case CPU_F10H:
+ ops = &fam_ops[AMD_F10H];
+ break;
+ default:
+ Eprintf("Huh? What family is it: 0x%x?!\n", cpu);
+ return;
+ break;
+ }
+
+ switch (mce->bank) {
+ case 0:
+ decode_mc0_mce(ops, mce);
+ break;
+ case 1:
+ decode_mc1_mce(ops, mce);
+ break;
+ case 2:
+ decode_mc2_mce(ops, mce);
+ break;
+ case 3:
+ decode_mc3_mce(ops, mce);
+ break;
+ case 4:
+ decode_mc4_mce(ops, mce);
+ break;
+ case 5:
+ decode_mc5_mce(ops, mce);
+ break;
+ case 6:
+ decode_mc6_mce(mce);
+ break;
+
+ default:
+ break;
+ }
+ amd_decode_err_code(mce->status & 0xffff);
+}
+
+/*
+ * Entry point for decoding an AMD machine check.  CPU_K8 keeps using the
+ * per-bank decoders table and the threshold decoder; every other cputype is
+ * handed to __decode_amd_mc().
+ * NOTE(review): ismemerr is only passed on the CPU_K8 path; the other path
+ * never writes it - confirm callers do not rely on it there.
+ */
+void decode_amd_mc(enum cputype cpu, struct mce *mce, int *ismemerr)
+{
+ if (cpu == CPU_K8) {
+ if (mce->bank < NELE(decoders))
+ decoders[mce->bank](mce->status, ismemerr);
+ else if (mce->bank >= K8_MCE_THRESHOLD_BASE &&
+ mce->bank < K8_MCE_THRESHOLD_TOP)
+ decode_k8_threshold(mce->misc);
+ else
+ Wprintf(" no decoder for unknown bank %u\n", mce->bank);
+ } else
+ __decode_amd_mc(cpu, mce);
}
char *k8_bank_name(unsigned num)
{
static char buf[64];
- char *s = "unknown";
+ const char *s = "unknown";
if (num < NELE(k8bank))
s = k8bank[num];
else if (num >= K8_MCE_THRESHOLD_BASE &&
@@ -270,13 +703,16 @@ char *k8_bank_name(unsigned num)
return buf;
}
-int mce_filter_k8(struct mce *m)
-{
- /* Filter out GART errors */
- if (m->bank == 4) {
- unsigned short exterrcode = (m->status >> 16) & 0x0f;
- if (exterrcode == 5 && (m->status & (1ULL<<61)))
+int mce_filter_amd(struct mce *m)
+{
+ /*
+ * NB GART TLB error reporting is disabled by default.
+ */
+ if (m->bank == 4) {
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ if (xec == 0x5 && (m->status & BIT_64(61)))
return 0;
- }
- return 1;
+ }
+ return 1;
}
Index: mcelog-189/amd.h
===================================================================
--- mcelog-189.orig/amd.h
+++ mcelog-189/amd.h
@@ -1,6 +1,25 @@
+#include <stdbool.h>
+
char *k8_bank_name(unsigned num);
void decode_amd_mc(enum cputype, struct mce *mce, int *ismemerr);
-int mce_filter_k8(struct mce *m);
+int mce_filter_amd(struct mce *m);
+enum cputype select_amd_cputype(u32 family);
+
+/*
+ * AMD families known to the decoder; the values index fam_ops[].
+ * Keep the enumerators in ascending family order: decode_mc3_mce() relies on
+ * an ordered comparison against AMD_F14H.
+ */
+enum amdcpu {
+ AMD_K8 = 0,
+ AMD_F10H,
+ AMD_F11H,
+ AMD_F14H,
+ AMD_F15H,
+ AMD_F16H,
+};
+
+/*
+ * Family specific decoders for banks 0-2.  Each callback receives the 16 bit
+ * error code and the extended error code and returns true when it recognised
+ * the combination; the callers report a corrupted record otherwise.
+ */
+struct amd_decoder_ops {
+ enum amdcpu cpu;
+ bool (*mc0_mce)(u16, u8);
+ bool (*mc1_mce)(u16, u8);
+ bool (*mc2_mce)(u16, u8);
+};
#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
#define K8_MCE_THRESHOLD_TOP (K8_MCE_THRESHOLD_BASE + 6 * 9)
@@ -10,6 +29,8 @@ int mce_filter_k8(struct mce *m);
#define K8_MCELOG_THRESHOLD_L3_CACHE (4 * 9 + 2)
#define K8_MCELOG_THRESHOLD_FBDIMM (4 * 9 + 3)
+#define BIT_64(n) (1ULL << (n))
+
#define EC(x) ((x) & 0xffff)
#define XEC(x, mask) (((x) >> 16) & mask)
@@ -22,20 +43,20 @@ int mce_filter_k8(struct mce *m);
#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400)
#define TT(x) (((x) >> 2) & 0x3)
-#define TT_MSG(x) tt_msgs[TT(x)]
+#define TT_MSG(x) transaction[TT(x)]
#define II(x) (((x) >> 2) & 0x3)
-#define II_MSG(x) ii_msgs[II(x)]
+#define II_MSG(x) memoryio[II(x)]
#define LL(x) ((x) & 0x3)
-#define LL_MSG(x) ll_msgs[LL(x)]
+#define LL_MSG(x) cachelevel[LL(x)]
#define TO(x) (((x) >> 8) & 0x1)
-#define TO_MSG(x) to_msgs[TO(x)]
+#define TO_MSG(x) timeout[TO(x)]
#define PP(x) (((x) >> 9) & 0x3)
-#define PP_MSG(x) pp_msgs[PP(x)]
+#define PP_MSG(x) partproc[PP(x)]
#define UU(x) (((x) >> 8) & 0x3)
#define UU_MSG(x) uu_msgs[UU(x)]
#define R4(x) (((x) >> 4) & 0xf)
-#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!")
+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!")
enum tt_ids {
TT_INSTR = 0,
Index: mcelog-189/mcelog.c
===================================================================
--- mcelog-189.orig/mcelog.c
+++ mcelog-189/mcelog.c
@@ -152,8 +152,8 @@ static int mce_filter(struct mce *m, uns
/* Filter out known broken MCEs */
if (cputype >= CPU_INTEL)
return mce_filter_intel(m, recordlen);
- else if (cputype == CPU_K8)
- return mce_filter_k8(m);
+ else if CASE_AMD_CPUS
+ return mce_filter_amd(m);
return 1;
}
@@ -283,9 +283,7 @@ static enum cputype setup_cpuid(u32 cpuv
case X86_VENDOR_INTEL:
return select_intel_cputype(family, model);
case X86_VENDOR_AMD:
- if (family >= 15 && family <= 17)
- return CPU_K8;
- /* FALL THROUGH */
+ return select_amd_cputype(family);
default:
Eprintf("Unknown CPU type vendor %u family %u model %u",
cpuvendor, family, model);
@@ -347,7 +345,7 @@ static void dump_mce(struct mce *m, unsi
Wprintf("TIME %llu %s", m->time, ctime(&t));
}
if CASE_AMD_CPUS
- decode_amd_mc(m, &ismemerr);
+ decode_amd_mc(cputype, m, &ismemerr);
else if (cputype >= CPU_INTEL)
decode_intel_mc(m, cputype, &ismemerr, recordlen);
/* else add handlers for other CPUs here */
@@ -463,14 +461,9 @@ int is_cpu_supported(void)
}
if (seen == ALL) {
- if (!strcmp(vendor,"AuthenticAMD")) {
- if (family == 15) {
- cputype = CPU_K8;
- } else if (family >= 16) {
- Eprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family);
- return 0;
- }
- } else if (!strcmp(vendor,"HygonGenuine")) {
+ if (!strcmp(vendor,"AuthenticAMD"))
+ cputype = select_amd_cputype(family);
+ else if (!strcmp(vendor,"HygonGenuine")) {
Eprintf("ERROR: Hygon Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family);
return 0;
} else if (!strcmp(vendor,"GenuineIntel"))

63
add-f11h-support.patch Normal file
View File

@ -0,0 +1,63 @@
Add F11h decoding support
Signed-off-by: Borislav Petkov <bp@suse.de>
---
amd.c | 21 +++++++++++++++++++++
amd.h | 3 ++-
mcelog.c | 2 ++
mcelog.h | 1 +
4 files changed, 26 insertions(+), 1 deletion(-)
Index: mcelog-189/amd.c
===================================================================
--- mcelog-189.orig/amd.c
+++ mcelog-189/amd.c
@@ -155,6 +155,8 @@ enum cputype select_amd_cputype(u32 fami
return CPU_K8;
case 0x10:
return CPU_F10H;
+ case 0x11:
+ return CPU_F11H;
default:
break;
}
@@ -367,6 +369,16 @@ static bool f10h_mc0_mce(u16 ec, u8 xec)
return f12h_mc0_mce(ec, xec);
}
+/*
+ * MC0 decoder that reports any bus error as a system linefill error and
+ * hands every other error code to f10h_mc0_mce().
+ */
+static bool k8_mc0_mce(u16 ec, u8 xec)
+{
+ if (BUS_ERROR(ec)) {
+ Wprintf("during system linefill.\n");
+ return true;
+ }
+
+ return f10h_mc0_mce(ec, xec);
+}
+
static void decode_mc0_mce(struct amd_decoder_ops *ops, struct mce *m)
{
u16 ec = EC(m->status);
@@ -630,6 +642,12 @@ struct amd_decoder_ops fam_ops[] = {
.mc1_mce = k8_mc1_mce,
.mc2_mce = k8_mc2_mce,
},
+ [AMD_F11H] = {
+ .cpu = AMD_F11H,
+ .mc0_mce = k8_mc0_mce,
+ .mc1_mce = k8_mc1_mce,
+ .mc2_mce = k8_mc2_mce,
+ },
};
static void __decode_amd_mc(enum cputype cpu, struct mce *mce)
@@ -640,6 +658,9 @@ static void __decode_amd_mc(enum cputype
case CPU_F10H:
ops = &fam_ops[AMD_F10H];
break;
+ case CPU_F11H:
+ ops = &fam_ops[AMD_F11H];
+ break;
default:
Eprintf("Huh? What family is it: 0x%x?!\n", cpu);
return;

58
add-f12h-support.patch Normal file
View File

@ -0,0 +1,58 @@
Add F12h decoding support
Signed-off-by: Borislav Petkov <bp@suse.de>
---
amd.c | 11 +++++++++++
amd.h | 4 +++-
mcelog.c | 2 ++
mcelog.h | 1 +
4 files changed, 17 insertions(+), 1 deletion(-)
Index: mcelog-189/amd.c
===================================================================
--- mcelog-189.orig/amd.c
+++ mcelog-189/amd.c
@@ -157,6 +157,8 @@ enum cputype select_amd_cputype(u32 fami
return CPU_F10H;
case 0x11:
return CPU_F11H;
+ case 0x12:
+ return CPU_F12H;
default:
break;
}
@@ -648,6 +650,12 @@ struct amd_decoder_ops fam_ops[] = {
.mc1_mce = k8_mc1_mce,
.mc2_mce = k8_mc2_mce,
},
+ [AMD_F12H] = {
+ .cpu = AMD_F12H,
+ .mc0_mce = f12h_mc0_mce,
+ .mc1_mce = k8_mc1_mce,
+ .mc2_mce = k8_mc2_mce,
+ },
};
static void __decode_amd_mc(enum cputype cpu, struct mce *mce)
@@ -661,6 +669,9 @@ static void __decode_amd_mc(enum cputype
case CPU_F11H:
ops = &fam_ops[AMD_F11H];
break;
+ case CPU_F12H:
+ ops = &fam_ops[AMD_F12H];
+ break;
default:
Eprintf("Huh? What family is it: 0x%x?!\n", cpu);
return;
Index: mcelog-189/amd.h
===================================================================
--- mcelog-189.orig/amd.h
+++ mcelog-189/amd.h
@@ -9,6 +9,7 @@ enum amdcpu {
AMD_K8 = 0,
AMD_F10H,
AMD_F11H,
+ AMD_F12H,
AMD_F14H,
AMD_F15H,
AMD_F16H,

137
add-f14h-support.patch Normal file
View File

@ -0,0 +1,137 @@
Add F14h decoding support
Signed-off-by: Borislav Petkov <bp@suse.de>
---
amd.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
amd.h | 3 +-
mcelog.c | 2 +
mcelog.h | 1
4 files changed, 93 insertions(+), 1 deletion(-)
Index: mcelog-189/amd.c
===================================================================
--- mcelog-189.orig/amd.c
+++ mcelog-189/amd.c
@@ -159,6 +159,8 @@ enum cputype select_amd_cputype(u32 fami
return CPU_F11H;
case 0x12:
return CPU_F12H;
+ case 0x14:
+ return CPU_F14H;
default:
break;
}
@@ -381,6 +383,58 @@ static bool k8_mc0_mce(u16 ec, u8 xec)
return f10h_mc0_mce(ec, xec);
}
+/*
+ * MC0 decoder wired into the AMD_F14H slot of fam_ops.
+ * Memory errors must be data transactions at L1 and are described by request
+ * type; bus errors must target memory or I/O at the generic level and are
+ * described as system read data errors.  Returns false for anything else.
+ * NOTE(review): in the bus error branch the "System read data error on a "
+ * prefix is already printed when an unknown request type makes the function
+ * return false, so the caller's "Corrupted" message follows a partial line.
+ */
+static bool cat_mc0_mce(u16 ec, u8 xec)
+{
+ u8 r4 = R4(ec);
+ bool ret = true;
+
+ if (MEM_ERROR(ec)) {
+
+ if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
+ return false;
+
+ switch (r4) {
+ case R4_DRD:
+ case R4_DWR:
+ Wprintf("Data/Tag parity error due to %s.\n",
+ (r4 == R4_DRD ? "load/hw prf" : "store"));
+ break;
+ case R4_EVICT:
+ Wprintf("Copyback parity error on a tag miss.\n");
+ break;
+ case R4_SNOOP:
+ Wprintf("Tag parity error during snoop.\n");
+ break;
+ default:
+ ret = false;
+ }
+ } else if (BUS_ERROR(ec)) {
+
+ if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
+ return false;
+
+ Wprintf("System read data error on a ");
+
+ switch (r4) {
+ case R4_RD:
+ Wprintf("TLB reload.\n");
+ break;
+ case R4_DWR:
+ Wprintf("store.\n");
+ break;
+ case R4_DRD:
+ Wprintf("load.\n");
+ break;
+ default:
+ ret = false;
+ }
+ } else {
+ ret = false;
+ }
+
+ return ret;
+}
+
static void decode_mc0_mce(struct amd_decoder_ops *ops, struct mce *m)
{
u16 ec = EC(m->status);
@@ -402,6 +456,31 @@ static void decode_mc0_mce(struct amd_de
Eprintf("Corrupted MC0 MCE info?\n");
}
+/*
+ * MC1 decoder wired into the AMD_F14H slot of fam_ops.
+ * Only memory errors on instruction transactions are accepted.  The request
+ * type is checked first (instruction read, snoop); if neither matches, the
+ * extended error code 0x0 or 0x2 selects the message.  Returns false for
+ * anything else.
+ */
+static bool cat_mc1_mce(u16 ec, u8 xec)
+{
+ u8 r4 = R4(ec);
+ bool ret = true;
+
+ if (!MEM_ERROR(ec))
+ return false;
+
+ if (TT(ec) != TT_INSTR)
+ return false;
+
+ if (r4 == R4_IRD)
+ Wprintf("Data/tag array parity error for a tag hit.\n");
+ else if (r4 == R4_SNOOP)
+ Wprintf("Tag error during snoop/victimization.\n");
+ else if (xec == 0x0)
+ Wprintf("Tag parity error from victim castout.\n");
+ else if (xec == 0x2)
+ Wprintf("Microcode patch RAM parity error.\n");
+ else
+ ret = false;
+
+ return ret;
+}
+
static void decode_mc1_mce(struct amd_decoder_ops *ops, struct mce *m)
{
u16 ec = EC(m->status);
@@ -656,6 +735,12 @@ struct amd_decoder_ops fam_ops[] = {
.mc1_mce = k8_mc1_mce,
.mc2_mce = k8_mc2_mce,
},
+ [AMD_F14H] = {
+ .cpu = AMD_F14H,
+ .mc0_mce = cat_mc0_mce,
+ .mc1_mce = cat_mc1_mce,
+ .mc2_mce = k8_mc2_mce,
+ },
};
static void __decode_amd_mc(enum cputype cpu, struct mce *mce)
@@ -672,6 +757,9 @@ static void __decode_amd_mc(enum cputype
case CPU_F12H:
ops = &fam_ops[AMD_F12H];
break;
+ case CPU_F14H:
+ ops = &fam_ops[AMD_F14H];
+ break;
default:
Eprintf("Huh? What family is it: 0x%x?!\n", cpu);
return;

223
add-f15h-support.patch Normal file
View File

@ -0,0 +1,223 @@
Add F15h decoding support
Signed-off-by: Borislav Petkov <bp@suse.de>
---
amd.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
amd.h | 3 -
mcelog.c | 2
mcelog.h | 1
4 files changed, 165 insertions(+), 1 deletion(-)
Index: mcelog-189/amd.c
===================================================================
--- mcelog-189.orig/amd.c
+++ mcelog-189/amd.c
@@ -72,6 +72,43 @@ static char *nbextendederr[] = {
"L3 Cache LRU Error"
};
+/*
+ * Family 15h MC1 descriptions, looked up by f15h_mc1_mce().  The extended
+ * error codes are not contiguous: 0x0..0xa index the table directly, 0xd is
+ * stored at xec - 2 and 0x10..0x14 are stored at xec - 4.
+ */
+static const char * const f15h_mc1_mce_desc[] = {
+ "UC during a demand linefill from L2",
+ "Parity error during data load from IC",
+ "Parity error for IC valid bit",
+ "Main tag parity error",
+ "Parity error in prediction queue",
+ "PFB data/address parity error",
+ "Parity error in the branch status reg",
+ "PFB promotion address error",
+ "Tag error during probe/victimization",
+ "Parity error for IC probe tag valid bit",
+ "PFB non-cacheable bit parity error",
+ "PFB valid bit parity error", /* xec = 0xd */
+ "Microcode Patch Buffer", /* xec = 0x10 */
+ "uop queue",
+ "insn buffer",
+ "predecode buffer",
+ "fetch address FIFO"
+};
+
+/*
+ * Family 15h MC2 descriptions for memory errors, looked up by
+ * f15h_mc2_mce().  Extended error codes 0x4..0xc are stored at xec - 0x4 and
+ * codes 0x10..0x14 at xec - 0x7.
+ */
+static const char * const f15h_mc2_mce_desc[] = {
+ "Fill ECC error on data fills", /* xec = 0x4 */
+ "Fill parity error on insn fills",
+ "Prefetcher request FIFO parity error",
+ "PRQ address parity error",
+ "PRQ data parity error",
+ "WCC Tag ECC error",
+ "WCC Data ECC error",
+ "WCB Data parity error",
+ "VB Data ECC or parity error",
+ "L2 Tag ECC error", /* xec = 0x10 */
+ "Hard L2 Tag ECC error",
+ "Multiple hits on L2 tag",
+ "XAB parity error",
+ "PRB address parity error"
+};
+
static const char * const mc4_mce_desc[] = {
"DRAM ECC error detected on the NB",
"CRC error detected on HT link",
@@ -161,6 +198,8 @@ enum cputype select_amd_cputype(u32 fami
return CPU_F12H;
case 0x14:
return CPU_F14H;
+ case 0x15:
+ return CPU_F15H;
default:
break;
}
@@ -435,6 +474,53 @@ static bool cat_mc0_mce(u16 ec, u8 xec)
return ret;
}
+/*
+ * Family 15h MC0 decoder.  Memory errors are described by the extended error
+ * code alone; bus errors are a system read data error for xec 0 and an
+ * internal error condition carrying the xec value otherwise.  Returns false
+ * for unknown memory error codes and for every other error class.
+ */
+static bool f15h_mc0_mce(u16 ec, u8 xec)
+{
+ bool ret = true;
+
+ if (MEM_ERROR(ec)) {
+
+ switch (xec) {
+ case 0x0:
+ Wprintf("Data Array access error.\n");
+ break;
+
+ case 0x1:
+ Wprintf("UC error during a linefill from L2/NB.\n");
+ break;
+
+ case 0x2:
+ case 0x11:
+ Wprintf("STQ access error.\n");
+ break;
+
+ case 0x3:
+ Wprintf("SCB access error.\n");
+ break;
+
+ case 0x10:
+ Wprintf("Tag error.\n");
+ break;
+
+ case 0x12:
+ Wprintf("LDQ access error.\n");
+ break;
+
+ default:
+ ret = false;
+ }
+ } else if (BUS_ERROR(ec)) {
+
+ if (!xec)
+ Wprintf("System Read Data Error.\n");
+ else
+ Wprintf(" Internal error condition type %d.\n", xec);
+ } else
+ ret = false;
+
+ return ret;
+}
+
static void decode_mc0_mce(struct amd_decoder_ops *ops, struct mce *m)
{
u16 ec = EC(m->status);
@@ -481,6 +567,36 @@ static bool cat_mc1_mce(u16 ec, u8 xec)
return ret;
}
+/*
+ * Family 15h MC1 decoder.  Only memory errors are accepted; the extended
+ * error code picks the text from f15h_mc1_mce_desc, whose entries are packed,
+ * hence the xec - 2 and xec - 4 adjustments.  Codes 0x11..0x14 are printed as
+ * decoder parity errors.  Returns false for unknown codes.
+ */
+static bool f15h_mc1_mce(u16 ec, u8 xec)
+{
+ bool ret = true;
+
+ if (!MEM_ERROR(ec))
+ return false;
+
+ switch (xec) {
+ case 0x0 ... 0xa:
+ Wprintf("%s.\n", f15h_mc1_mce_desc[xec]);
+ break;
+
+ case 0xd:
+ Wprintf("%s.\n", f15h_mc1_mce_desc[xec-2]);
+ break;
+
+ case 0x10:
+ Wprintf("%s.\n", f15h_mc1_mce_desc[xec-4]);
+ break;
+
+ case 0x11 ... 0x14:
+ Wprintf("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
+ break;
+
+ default:
+ ret = false;
+ }
+ return ret;
+}
+
static void decode_mc1_mce(struct amd_decoder_ops *ops, struct mce *m)
{
u16 ec = EC(m->status);
@@ -537,6 +653,40 @@ static bool k8_mc2_mce(u16 ec, u8 xec)
return ret;
}
+/*
+ * Family 15h MC2 decoder.
+ * TLB errors are described by xec 0x0/0x1, bus errors with xec up to 2 are
+ * NB data read errors, and memory errors take their text from
+ * f15h_mc2_mce_desc (xec 0x4..0xc at xec - 0x4, xec 0x10..0x14 at xec - 0x7).
+ * Returns true only when a complete line was printed; for every unrecognised
+ * combination nothing is printed and false is returned, so that the caller
+ * reports the record as corrupted exactly once.
+ */
+static bool f15h_mc2_mce(u16 ec, u8 xec)
+{
+	bool ret = true;
+
+	if (TLB_ERROR(ec)) {
+		if (xec == 0x0)
+			Wprintf("Data parity TLB read error.\n");
+		else if (xec == 0x1)
+			Wprintf("Poison data provided for TLB fill.\n");
+		else
+			ret = false;
+	} else if (BUS_ERROR(ec)) {
+		/* do not print a decode for a code we are about to reject */
+		if (xec > 2)
+			ret = false;
+		else
+			Wprintf("Error during attempted NB data read.\n");
+	} else if (MEM_ERROR(ec)) {
+		switch (xec) {
+		case 0x4 ... 0xc:
+			Wprintf("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
+			break;
+
+		case 0x10 ... 0x14:
+			Wprintf("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
+			break;
+
+		default:
+			ret = false;
+		}
+	} else {
+		/* neither TLB, bus nor memory error: nothing was printed */
+		ret = false;
+	}
+
+	return ret;
+}
+
static void decode_mc2_mce(struct amd_decoder_ops *ops, struct mce *m)
{
u16 ec = EC(m->status);
@@ -741,6 +891,12 @@ struct amd_decoder_ops fam_ops[] = {
.mc1_mce = cat_mc1_mce,
.mc2_mce = k8_mc2_mce,
},
+ [AMD_F15H] = {
+ .cpu = AMD_F15H,
+ .mc0_mce = f15h_mc0_mce,
+ .mc1_mce = f15h_mc1_mce,
+ .mc2_mce = f15h_mc2_mce,
+ },
};
static void __decode_amd_mc(enum cputype cpu, struct mce *mce)
@@ -760,6 +916,10 @@ static void __decode_amd_mc(enum cputype
case CPU_F14H:
ops = &fam_ops[AMD_F14H];
break;
+ case CPU_F15H:
+ xec_mask = 0x1f;
+ ops = &fam_ops[AMD_F15H];
+ break;
default:
Eprintf("Huh? What family is it: 0x%x?!\n", cpu);
return;

95
add-f16h-support.patch Normal file
View File

@ -0,0 +1,95 @@
Add F16h decoding support
Signed-off-by: Borislav Petkov <bp@suse.de>
---
amd.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
amd.h | 3 ++-
mcelog.c | 2 ++
mcelog.h | 1 +
4 files changed, 58 insertions(+), 1 deletion(-)
Index: mcelog-189/amd.c
===================================================================
--- mcelog-189.orig/amd.c
+++ mcelog-189/amd.c
@@ -200,6 +200,8 @@ enum cputype select_amd_cputype(u32 fami
return CPU_F14H;
case 0x15:
return CPU_F15H;
+ case 0x16:
+ return CPU_F16H;
default:
break;
}
@@ -687,6 +689,47 @@ static bool f15h_mc2_mce(u16 ec, u8 xec)
return ret;
}
+/*
+ * Family 16h MC2 decoder.  Only memory errors are accepted.  The extended
+ * error code selects the affected structure (buffers, L2 tag, L2 data array,
+ * L2 attribute bits) and the request type selects the qualifier printed in
+ * parentheses.  Returns false for codes outside the listed ranges.
+ */
+static bool f16h_mc2_mce(u16 ec, u8 xec)
+{
+ u8 r4 = R4(ec);
+
+ if (!MEM_ERROR(ec))
+ return false;
+
+ switch (xec) {
+ case 0x04 ... 0x05:
+ Wprintf("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
+ break;
+
+ case 0x09 ... 0x0b:
+ case 0x0d ... 0x0f:
+ Wprintf("ECC error in L2 tag (%s).\n",
+ ((r4 == R4_GEN) ? "BankReq" :
+ ((r4 == R4_SNOOP) ? "Prb" : "Fill")));
+ break;
+
+ case 0x10 ... 0x19:
+ case 0x1b:
+ Wprintf("ECC error in L2 data array (%s).\n",
+ (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
+ ((r4 == R4_GEN) ? "Attr" :
+ ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
+ break;
+
+ case 0x1c ... 0x1d:
+ case 0x1f:
+ Wprintf("Parity error in L2 attribute bits (%s).\n",
+ ((r4 == R4_RD) ? "Hit" :
+ ((r4 == R4_GEN) ? "Attr" : "Fill")));
+ break;
+
+ default:
+ return false;
+ }
+
+ return true;
+}
+
static void decode_mc2_mce(struct amd_decoder_ops *ops, struct mce *m)
{
u16 ec = EC(m->status);
@@ -897,6 +940,12 @@ struct amd_decoder_ops fam_ops[] = {
.mc1_mce = f15h_mc1_mce,
.mc2_mce = f15h_mc2_mce,
},
+ [AMD_F16H] = {
+ .cpu = AMD_F16H,
+ .mc0_mce = cat_mc0_mce,
+ .mc1_mce = cat_mc1_mce,
+ .mc2_mce = f16h_mc2_mce,
+ },
};
static void __decode_amd_mc(enum cputype cpu, struct mce *mce)
@@ -920,6 +969,10 @@ static void __decode_amd_mc(enum cputype
xec_mask = 0x1f;
ops = &fam_ops[AMD_F15H];
break;
+ case CPU_F16H:
+ xec_mask = 0x1f;
+ ops = &fam_ops[AMD_F16H];
+ break;
default:
Eprintf("Huh? What family is it: 0x%x?!\n", cpu);
return;

30
add_new_amd_cpu_defines Normal file
View File

@ -0,0 +1,30 @@
Index: mcelog-189/mkcputype
===================================================================
--- mcelog-189.orig/mkcputype
+++ mcelog-189/mkcputype
@@ -5,6 +5,12 @@ awk -F\| 'BEGIN {
print "enum cputype {" > "cputype.tmp"
print "\tCPU_GENERIC," > "cputype.tmp"
print "\tCPU_K8," > "cputype.tmp"
+ print "\tCPU_F10H," > "cputype.tmp"
+ print "\tCPU_F11H," > "cputype.tmp"
+ print "\tCPU_F12H," > "cputype.tmp"
+ print "\tCPU_F14H," > "cputype.tmp"
+ print "\tCPU_F15H," > "cputype.tmp"
+ print "\tCPU_F16H," > "cputype.tmp"
print "\n\n/* Insert any new non-intel CPU models before this line */\n\n" > "cputype.tmp"
print "\tCPU_INTEL," > "cputype.tmp"
@@ -44,6 +50,12 @@ END {
print "char *cputype_name[] = {" > "lookup_intel_cputype.tmp"
print "\t[CPU_GENERIC] = \"generic CPU\"," > "lookup_intel_cputype.tmp"
print "\t[CPU_K8] = \"AMD K8 and derivates\"," > "lookup_intel_cputype.tmp"
+ print "\t[CPU_F10H] = \"AMD Greyhound\"," > "lookup_intel_cputype.tmp"
+ print "\t[CPU_F11H] = \"AMD Griffin\"," > "lookup_intel_cputype.tmp"
+ print "\t[CPU_F12H] = \"AMD Llano\"," > "lookup_intel_cputype.tmp"
+ print "\t[CPU_F14H] = \"AMD Bobcat\"," > "lookup_intel_cputype.tmp"
+ print "\t[CPU_F15H] = \"AMD Bulldozer\"," > "lookup_intel_cputype.tmp"
+ print "\t[CPU_F16H] = \"AMD Jaguar\"," > "lookup_intel_cputype.tmp"
print "\t[CPU_INTEL] = \"Intel generic architectural MCA\"," > "lookup_intel_cputype.tmp"
print "\t[CPU_P4] = \"Intel P4\"," > "lookup_intel_cputype.tmp"
print "\t[CPU_TULSA] = \"Intel Xeon 7100 series\"," > "lookup_intel_cputype.tmp"

510
email.patch Normal file
View File

@ -0,0 +1,510 @@
---
Makefile | 13 +++-
email.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
email.h | 34 ++++++++++
mcelog.c | 93 ++++++++++++++++++++++++++++-
mcelog.h | 1
msg.c | 8 ++
6 files changed, 346 insertions(+), 3 deletions(-)
Index: mcelog-195/Makefile
===================================================================
--- mcelog-195.orig/Makefile
+++ mcelog-195/Makefile
@@ -1,3 +1,4 @@
+CONFIG_EMAIL := 1
CFLAGS := -g -Os
prefix := /usr
etcprefix :=
@@ -38,16 +39,24 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o co
broadwell_de.o broadwell_epex.o skylake_xeon.o \
denverton.o i10nm.o sapphire.o granite.o \
msr.o bus.o unknown.o lookup_intel_cputype.o
+EMAIL_OBJ := email.o
CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o \
version.o version.c version.tmp cputype.h cputype.tmp \
- lookup_intel_cputype.c lookup_intel_cputype.tmp
+ lookup_intel_cputype.c lookup_intel_cputype.tmp ${EMAIL_OBJ}
DOC := mce.pdf
ADD_DEFINES :=
+ifdef CONFIG_EMAIL
+ADD_DEFINES := -DCONFIG_EMAIL=1
+LIBS := -lesmtp
+OBJ += ${EMAIL_OBJ}
+endif
+
SRC := $(OBJ:.o=.c)
mcelog: ${OBJ} version.o
+ $(CC) $(LDFLAGS) $^ ${LIBS} -o $@
# dbquery intentionally not installed by default
install: install-nodoc mcelog.conf.5 mcelog.triggers.5
@@ -85,7 +94,7 @@ dbquery: db.o dbquery.o memutil.o
depend: .depend
%.o: %.c
- $(CC) -c $(CFLAGS) $(CPPFLAGS) $(WARNINGS) $(ADD_DEFINES) -o $@ $<
+ $(CC) -c $(CFLAGS) $(CPPFLAGS) $(WARNINGS) $(ADD_DEFINES) $< -o $@
version.tmp: FORCE
( printf "char version[] = \"" ; \
Index: mcelog-195/email.c
===================================================================
--- /dev/null
+++ mcelog-195/email.c
@@ -0,0 +1,200 @@
+#include <unistd.h>
+#include <signal.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define __USE_GNU
+/* To fetch the dnsname */
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+
+#include <libesmtp.h>
+#include "mcelog.h"
+#include "email.h"
+
+#define MAX_STRING_LEN 512
+char c_recipient[MAX_STRING_LEN] = "";
+static int debug;
+static char dnsname[MAX_STRING_LEN];
+
+static char buf[128];
+#define ERROR() { fprintf (stderr, "SMTP problem [%d] %s\n", __LINE__, \
+ smtp_strerror (smtp_errno (), buf, sizeof buf)); \
+ return -1; }
+
+
+/* Print the help line for the --email option to stderr. */
+void email_usage(void) {
+ fprintf(stderr,
+ "--email address Requires daemon mode\n");
+}
+
+/*
+ * Handle the e-mail related command line options.
+ * O_EMAIL_ADDRESS stores optarg as the recipient and returns 1; an address
+ * that does not fit into c_recipient is rejected with a message.
+ * O_EMAIL_DEBUG switches debug output on.  Every other case returns 0.
+ * A missing address argument no longer falls through into the debug case.
+ * NOTE(review): O_EMAIL_DEBUG returns 0 like an unhandled option - confirm
+ * against the caller that this is the intended result.
+ */
+int email_cmd(int opt, int ac, char **av)
+{
+	char *arg = optarg;
+
+	switch (opt) {
+	case O_EMAIL_ADDRESS:
+		if (!arg)
+			return 0;
+		if (strlen(arg) >= MAX_STRING_LEN) {
+			/* one byte of the buffer is needed for the terminator */
+			Eprintf("email address too long"
+				" [max:%d]\n", MAX_STRING_LEN - 1);
+			return 0;
+		}
+		strcpy(c_recipient, arg);
+		return 1;
+	case O_EMAIL_DEBUG:
+		debug = 1;
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Pick up the e-mail configuration from the environment.
+ * MCELOG_EMAIL_DEBUG, when set to anything, switches debug output on.
+ * MCELOG_ADMIN_EMAIL, when longer than one character, becomes the recipient
+ * (truncated to fit c_recipient).
+ * Returns 1 when a recipient was taken from the environment, 0 otherwise.
+ */
+int email_env(void)
+{
+	const char *value = getenv("MCELOG_EMAIL_DEBUG");
+
+	/* the mere presence of the variable enables debugging */
+	if (value)
+		debug = 1;
+
+	value = getenv("MCELOG_ADMIN_EMAIL");
+	/* No email validation, but at least check for not being empty... */
+	if (value && strlen(value) > 1) {
+		/* snprintf always terminates, unlike strncpy */
+		snprintf(c_recipient, sizeof(c_recipient), "%s", value);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Callback to print the recipient status.  Passed to
+ * smtp_enumerate_recipients() by send_mail(); prints the status code and text
+ * for the mailbox only when debug output is enabled.  arg is not used.
+ */
+static void
+print_recipient_status (smtp_recipient_t recipient,
+ const char *mailbox, void *arg)
+{
+ const smtp_status_t *status;
+
+ status = smtp_recipient_status (recipient);
+ if (debug)
+ printf ("%s: %d %s", mailbox, status->code, status->text);
+}
+
+/*
+ * Write the mail header for a machine check report to fp.
+ * The canonical name of the local host is resolved (retrying a few times on
+ * EAI_AGAIN), remembered in dnsname for send_mail() and used in the subject.
+ * When the host name cannot be determined a message goes to stderr and
+ * nothing is written to fp.  m is currently not used.
+ */
+void setup_mail_header(FILE *fp, struct mce *m)
+{
+	char host[MAX_STRING_LEN];
+	struct addrinfo hints;
+	struct addrinfo *res = NULL;
+	int ret, retry = 3;
+
+	(void)m;
+
+	/* Taken from net-tools hostname.c showhname() */
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_flags = AI_CANONNAME | AI_CANONIDN;
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_protocol = 0;
+
+	if (gethostname(host, sizeof(host))) {
+		fprintf(stderr, "Cannot get host name\n");
+		return;
+	}
+	/* a truncated name is not guaranteed to be terminated */
+	host[sizeof(host) - 1] = '\0';
+
+	do {
+		ret = getaddrinfo(host, NULL, &hints, &res);
+	} while (ret == EAI_AGAIN && retry-- > 0
+		 && usleep(50000) == 0);
+
+	if (ret != 0 || res == NULL) {
+		fprintf(stderr, "Could not retrieve hostname\n");
+		return;
+	}
+
+	/* use the plain host name if no canonical name was returned */
+	snprintf(dnsname, sizeof(dnsname), "%s",
+		 res->ai_canonname ? res->ai_canonname : host);
+	freeaddrinfo(res);
+
+	fprintf(fp, "Return-Path: <dummy@will_get_overridden.net>\r\n"
+		"Subject: Machine Check Exception on %s detected\r\n"
+		"MIME-Version: 1.0\r\n"
+		"Content-Type: text/plain;\r\n"
+		" charset=iso-8859-1\r\n"
+		"Content-Transfer-Encoding: 7bit\r\n\r\n", dnsname);
+}
+
+
+/*
+ * Send the message held in fp to c_recipient through the SMTP server on
+ * localhost:25, with root@<dnsname> as the sender.
+ * Returns 0 when the SMTP session ran, -1 when the message could not be set
+ * up or the session could not be started.  The libESMTP session is destroyed
+ * on every path.
+ * NOTE(review): a failed smtp_start_session() now also returns -1 (it used
+ * to return 0 after logging) - confirm callers treat -1 as "not sent".
+ */
+int send_mail(FILE *fp)
+{
+	char smtp_host[MAX_STRING_LEN] = "localhost:25";
+	char from[MAX_STRING_LEN];
+
+	const smtp_status_t *status;
+	smtp_session_t session;
+	smtp_message_t message;
+	smtp_recipient_t recipient;
+	struct sigaction sa;
+	int fail_line = 0;
+	int rc = -1;
+
+	session = smtp_create_session ();
+	if (!session) {
+		fprintf (stderr, "SMTP problem [%d] %s\n", __LINE__,
+			 smtp_strerror (smtp_errno (), buf, sizeof buf));
+		return -1;
+	}
+
+	message = smtp_add_message (session);
+	if (!message) {
+		fail_line = __LINE__;
+		goto smtp_error;
+	}
+
+	snprintf(from, sizeof(from), "root@%s", dnsname);
+
+	/* NB. libESMTP sets timeouts as it progresses through the protocol.
+	   In addition the remote server might close its socket on a timeout.
+	   Consequently libESMTP may sometimes try to write to a socket with
+	   no reader. Ignore SIGPIPE, then the program doesn't get killed
+	   if/when this happens. */
+	sa.sa_handler = SIG_IGN;
+	sigemptyset (&sa.sa_mask);
+	sa.sa_flags = 0;
+	sigaction (SIGPIPE, &sa, NULL);
+
+	/* Set the host running the SMTP server. LibESMTP has a default port
+	   number of 587, however this is not widely deployed so the port
+	   is specified as 25 along with the default MTA host. */
+	if (!smtp_set_server (session, smtp_host)) {
+		fail_line = __LINE__;
+		goto smtp_error;
+	}
+
+	smtp_set_reverse_path (message, from);
+
+	/* RFC 2822 doesn't require recipient headers but a To: header would
+	   be nice to have if not present. */
+	smtp_set_header (message, "To", NULL, NULL);
+
+	/* The sender shown to the reader of the mail. */
+	if (!smtp_set_header (message, "From", "mcelog", from)) {
+		fail_line = __LINE__;
+		goto smtp_error;
+	}
+
+	smtp_set_message_fp (message, fp);
+
+	recipient = smtp_add_recipient (message, c_recipient);
+	if (!recipient) {
+		fail_line = __LINE__;
+		goto smtp_error;
+	}
+	if (!smtp_dsn_set_notify (recipient, Notify_NEVER)) {
+		fail_line = __LINE__;
+		goto smtp_error;
+	}
+
+	/* Initiate a connection to the SMTP server and transfer the
+	   message. */
+	if (!smtp_start_session (session)) {
+		Eprintf("SMTP server problem %s\n",
+			smtp_strerror (smtp_errno (), buf, sizeof buf));
+		goto out;
+	}
+
+	/* Report on the success or otherwise of the mail transfer.
+	 */
+	if (debug) {
+		status = smtp_message_transfer_status (message);
+		printf ("%d %s", status->code,
+			(status->text != NULL) ? status->text : "\n");
+	}
+	smtp_enumerate_recipients (message, print_recipient_status, NULL);
+
+	/* only claim success when the session actually ran */
+	if (debug)
+		fprintf(stderr, "Email sent successfully!\n");
+	rc = 0;
+	goto out;
+
+smtp_error:
+	fprintf (stderr, "SMTP problem [%d] %s\n", fail_line,
+		 smtp_strerror (smtp_errno (), buf, sizeof buf));
+out:
+	/* Free resources consumed by the program.
+	 */
+	smtp_destroy_session (session);
+	return rc;
+}
Index: mcelog-195/email.h
===================================================================
--- /dev/null
+++ mcelog-195/email.h
@@ -0,0 +1,34 @@
+#ifndef MCELOG_EMAIL_H
+#define MCELOG_EMAIL_H
+
+/*
+ * Optional e-mail notification of machine check events.
+ *
+ * With CONFIG_EMAIL defined the real implementation is declared here;
+ * without it the option hooks collapse to no-op inline stubs so that
+ * callers need no #ifdefs of their own.
+ */
+
+#include <stdio.h>
+
+struct mce;
+
+/* Stream Wprintf() additionally writes to while a mail is being composed;
+   NULL when no mail is in progress. */
+extern FILE *email_fd;
+/* Non-zero when e-mail notification is enabled. */
+extern int email_mode;
+
+#ifdef CONFIG_EMAIL
+/* Send the message that can be read from email_fd via SMTP. */
+extern int send_mail(FILE *email_fd);
+/* Presumably writes the mail header for MCE record m to email_fd --
+   TODO confirm against email.c. */
+extern void setup_mail_header(FILE *email_fd, struct mce *m);
+/* Called from usage() to print the e-mail related options. */
+extern void email_usage(void);
+/* Option hook for main(): main() enables email_mode when this returns
+   non-zero for opt. */
+extern int email_cmd(int opt, int ac, char **av);
+/* main() uses the return value as email_mode when no e-mail option was
+   given on the command line. */
+extern int email_env(void);
+
+/* getopt_long() entries spliced into options[] in mcelog.c; the trailing
+   comma is intentional. */
+#define EMAIL_OPTIONS \
+	{ "email", 1, NULL, O_EMAIL_ADDRESS }, \
+	{ "email-debug", 0, NULL, O_EMAIL_DEBUG },
+
+/* Option codes, allocated from the O_EMAIL range. */
+enum email_options {
+	O_EMAIL_ADDRESS = O_EMAIL,
+	O_EMAIL_DEBUG,
+};
+
+#else
+
+/*
+ * No-op stand-ins for builds without CONFIG_EMAIL.  They are
+ * "static inline" so that files which include this header without
+ * calling them do not trigger unused-function warnings.
+ */
+static inline void email_usage(void) { }
+static inline int email_cmd(int opt, int ac, char **av)
+{
+	(void)opt;
+	(void)ac;
+	(void)av;
+	return 0;
+}
+static inline int email_env(void) { return 0; }
+#define EMAIL_OPTIONS
+#endif
+
+#endif
Index: mcelog-195/mcelog.c
===================================================================
--- mcelog-195.orig/mcelog.c
+++ mcelog-195/mcelog.c
@@ -37,6 +37,7 @@
#include <assert.h>
#include <signal.h>
#include <pwd.h>
+#include <sys/wait.h>
#include <fnmatch.h>
#include "mcelog.h"
#include "paths.h"
@@ -60,6 +61,9 @@
#include "bus.h"
#include "unknown.h"
+#include "email.h"
+int email_mode;
+
enum cputype cputype = CPU_GENERIC;
char *logfn = LOG_DEV_FILENAME;
@@ -71,7 +75,7 @@ static double cpumhz;
static int cpumhz_forced;
int ascii_mode;
int dump_raw_ascii;
-int daemon_mode;
+int daemon_mode = 0;
static char *inputfile;
char *processor_flags;
static int foreground;
@@ -906,6 +910,7 @@ void usage(void)
"--max-corr-err-counters Max page correctable error counters\n"
"--help Display this message.\n"
);
+ email_usage();
printf("\n");
print_cputypes();
}
@@ -977,6 +982,7 @@ static struct option options[] = {
{ "max-corr-err-counters", 1, NULL, O_MAX_CORR_ERR_COUNTERS },
{ "help", 0, NULL, O_HELP },
{ "is-cpu-supported", 0, NULL, O_IS_CPU_SUPPORTED },
+ EMAIL_OPTIONS
{}
};
@@ -1171,11 +1177,86 @@ static void drop_cred(void)
}
}
+#ifdef CONFIG_EMAIL
+/* PID returned by mcelog_fork() for the most recent mail child;
+   finish_email() waits on it. */
+pid_t c_pid;
+
+/* Rate limit: not more than LIMIT_COUNT (12) mails per window of
+   LAST_LIMIT_COUNT seconds (5 minutes). */
+#define LAST_LIMIT_COUNT (60 * 5)
+#define LIMIT_COUNT 12
+/* time() at which the current rate-limit window started. */
+static time_t last_limit_count;
+/* Number of mails started within the current window. */
+static int limit_count;
+/* Name handed to mcelog_fork() for the mail child. */
+static const char *mail_thread = "mcelog_mail_thread";
+
+
+/*
+ * Start composing a notification mail for MCE record m.
+ *
+ * Applies the rate limit (LIMIT_COUNT mails per LAST_LIMIT_COUNT seconds),
+ * then forks a child that reads the message from a pipe and passes it to
+ * send_mail().  In the parent, email_fd is pointed at the write end of
+ * that pipe so that everything printed through Wprintf() also lands in
+ * the mail, and the mail header is written.
+ *
+ * Returns 0 when a mail is in progress (the caller must then call
+ * finish_email()), non-zero when the mail was suppressed or could not be
+ * set up; in that case email_fd is not left pointing at a new stream.
+ */
+static int setup_email(struct mce *m) {
+	static int suppressed;
+	int pdes[2];
+	int ret;
+	time_t now = time(NULL);
+
+	if (now - last_limit_count < LAST_LIMIT_COUNT) {
+		/* Still inside the current window: warn once, then stay quiet. */
+		if (limit_count >= LIMIT_COUNT && !suppressed) {
+			Eprintf("email rate limit [%d mails per %d mins]"
+				" reached, mails suppressed\n",
+				LIMIT_COUNT, LAST_LIMIT_COUNT / 60);
+			suppressed = 1;
+		}
+		if (suppressed)
+			return -1;
+	} else {
+		/* Window expired: start a new one. */
+		suppressed = 0;
+		limit_count = 0;
+		last_limit_count = now;
+	}
+
+	limit_count++;
+
+	ret = pipe(pdes);
+	if (ret)
+		return ret;
+
+	c_pid = mcelog_fork(mail_thread);
+	if (c_pid < 0) {
+		/* No child was created: nobody would ever read the pipe. */
+		close(pdes[0]);
+		close(pdes[1]);
+		return -1;
+	}
+	if (c_pid == 0) { /* child */
+		FILE *in;
+
+		close(pdes[1]);
+		in = fdopen(pdes[0], "r");
+		if (!in)
+			exit(1);
+		send_mail(in);
+		exit(0);
+	}
+
+	/* parent */
+	close(pdes[0]);
+	/* A stream left over from an earlier mail means something went
+	   wrong, better close it. */
+	if (email_fd)
+		fclose(email_fd);
+	/* Wprintf will now also write into this pipe */
+	email_fd = fdopen(pdes[1], "w");
+	if (!email_fd) {
+		/* Closing the write end gives the child EOF right away. */
+		close(pdes[1]);
+		return -1;
+	}
+	setup_mail_header(email_fd, m);
+	return 0;
+}
+
+/*
+ * Finish the mail started by setup_email().
+ *
+ * Closes the write end of the pipe, which gives the mail child EOF, and
+ * waits for that child.  A child that is reported as stopped is killed
+ * and reaped.
+ *
+ * Returns 0 normally and -1 when a stopped child had to be killed.
+ * email_fd is NULL on return in every case, so Wprintf() never writes to
+ * the closed stream.
+ */
+static int finish_email(void) {
+	int status = 0;
+
+	if (email_fd) {
+		fclose(email_fd);
+		/* Clear immediately: the stream is invalid from here on. */
+		email_fd = NULL;
+	}
+	fprintf(stderr, "Email set up for sending\n");
+
+	/* If waitpid() fails, status is meaningless and must not drive a
+	   kill(); there is then no stopped child for us to clean up
+	   (presumably it was already reaped elsewhere -- TODO confirm). */
+	if (waitpid(c_pid, &status, WUNTRACED) < 0)
+		return 0;
+
+	if (WIFSTOPPED(status)) {
+		kill(c_pid, SIGKILL);
+		/* Reap the killed child so it does not linger as a zombie. */
+		waitpid(c_pid, &status, 0);
+		SYSERRprintf("Killed stopped email thread %d\n",
+			     (int)c_pid);
+		return -1;
+	}
+	return 0;
+}
+
+#endif
+
static void process(int fd, unsigned recordlen, unsigned loglen, char *buf)
{
int i;
int len, count;
int finish = 0, flags;
+ int mail_setup = 0;
if (recordlen == 0) {
Wprintf("no data in mce record\n");
@@ -1202,12 +1283,16 @@ static void process(int fd, unsigned rec
finish = 1;
if (!mce_filter(mce, recordlen))
continue;
+ if (email_mode)
+ mail_setup = setup_email(mce);
if (!dump_raw_ascii) {
disclaimer();
Wprintf("MCE %d\n", i);
dump_mce(mce, recordlen);
} else
dump_mce_raw_ascii(mce, recordlen);
+ if (email_mode && !mail_setup)
+ finish_email();
flushlog();
}
@@ -1321,6 +1406,8 @@ int main(int ac, char **av)
noargs(ac, av);
fprintf(stderr, "mcelog %s\n", MCELOG_VERSION);
exit(0);
+ } else if (email_cmd(opt, ac, av)) {
+ email_mode = 1;
} else if (opt == 0)
break;
}
@@ -1355,6 +1442,10 @@ int main(int ac, char **av)
usage();
exit(1);
}
+ if (email_mode == 0)
+ email_mode = email_env();
+ /* email sending only in daemon mode */
+ email_mode &= daemon_mode;
checkdmi();
general_setup();
Index: mcelog-195/mcelog.h
===================================================================
--- mcelog-195.orig/mcelog.h
+++ mcelog-195/mcelog.h
@@ -118,6 +118,7 @@ extern int open_logfile(char *fn);
enum option_ranges {
O_COMMON = 500,
O_DISKDB = 1000,
+ O_EMAIL = 1500,
};
enum syslog_opt {
Index: mcelog-195/msg.c
===================================================================
--- mcelog-195.orig/msg.c
+++ mcelog-195/msg.c
@@ -8,10 +8,13 @@
#include "mcelog.h"
#include "msg.h"
#include "memutil.h"
+#include "email.h"
+
enum syslog_opt syslog_opt = SYSLOG_REMARK;
int syslog_level = LOG_WARNING;
static FILE *output_fh;
+ FILE *email_fd;
static char *output_fn;
int need_stdout(void)
@@ -135,6 +138,11 @@ int Wprintf(char *fmt, ...)
n = vfprintf(output_fh ? output_fh : stdout, fmt, ap);
va_end(ap);
}
+ if (email_fd) {
+ va_start(ap,fmt);
+ n = vfprintf(email_fd, fmt, ap);
+ va_end(ap);
+ }
return n;
}

View File

@ -0,0 +1,31 @@
---
mcelog.c | 9 +++++++++
1 file changed, 9 insertions(+)
Index: mcelog-189/mcelog.c
===================================================================
--- mcelog-189.orig/mcelog.c
+++ mcelog-189/mcelog.c
@@ -37,6 +37,7 @@
#include <assert.h>
#include <signal.h>
#include <pwd.h>
+#include <grp.h>
#include <sys/wait.h>
#include <fnmatch.h>
#include "mcelog.h"
@@ -1155,6 +1156,14 @@ static void general_setup(void)
static void drop_cred(void)
{
+ /* When dropping privileges from root, the `setgroups` call will
+ * remove any extraneous groups. If we don't call this, then
+ * even though our uid has dropped, we may still have groups
+ * that enable us to do super-user things. This will fail if we
+ * aren't root, so don't bother checking the return value, this
+ * is just done as an optimistic privilege dropping function.
+ */
+ setgroups(0, NULL);
if (runcred.uid != -1U && runcred.gid == -1U) {
struct passwd *pw = getpwuid(runcred.uid);
if (pw)

BIN
mcelog-196.obscpio (Stored with Git LFS) Normal file

Binary file not shown.

13
mcelog-socket-path.patch Normal file
View File

@ -0,0 +1,13 @@
--- mcelog-1.0.1.orig/paths.h
+++ mcelog-1.0.1/paths.h
@@ -4,8 +4,8 @@
#define DIMM_DB_FILENAME PREFIX "/var/lib/memory-errors"
#define CONFIG_FILENAME PREFIX "/etc/mcelog/mcelog.conf"
-#define SOCKET_PATH "/var/run/mcelog-client"
+#define SOCKET_PATH "/run/mcelog/mcelog-client"
#define LOG_FILE "/var/log/mcelog"
-#define PID_FILE "/var/run/mcelog.pid"
+#define PID_FILE "/run/mcelog/mcelog.pid"

641
mcelog.changes Normal file
View File

@ -0,0 +1,641 @@
-------------------------------------------------------------------
Mon Nov 20 12:01:41 UTC 2023 - trenn@suse.de
- Update to version 196:
* mcelog: Add second model number for Arrowlake
-------------------------------------------------------------------
Tue Sep 12 14:08:37 UTC 2023 - trenn@suse.de
- This contains following features:
PED-6122
[GNR] RAS: mcelog Add support for Granite Rapids (ALP)
PED-6102
[GNR] RAS: mcelog Add support for Granite Rapids (SLE 15 SP6)
PED-6021
[SRF] RAS: mcelog support for Sierra Forest (SLE 15 SP6)
PED-6050
[SRF] RAS: mcelog support for Sierra Forest (ALP)
- Change git repo in _service file from git to https url
- Update to version 195:
* mcelog: Wire up model-specific decoding for Sierra Forest
* mcelog: Add model-specific decoding for Granite Rapids
* client.c: fix build w/ musl libc
* mcelog: New model number for Arrowlake
* mcelog: Don't overwrite model number when lookup fails
* mcelog: Add Graniterapids, Grandridge and Sierraforest
* mcelog: New model number for Lunarlake
* mcelog: Add Emerald Rapids
* Update PFA_test_howto
- Adapt to mainline:
M email.patch
-------------------------------------------------------------------
Wed Jun 14 14:58:43 UTC 2023 - trenn@suse.de
- Update to version 194 (jsc#PED-4218):
* client.c: fix build w/ musl libc
* mcelog: New model number for Arrowlake
* mcelog: Don't overwrite model number when lookup fails
* mcelog: Add Graniterapids, Grandridge and Sierraforest
* mcelog: New model number for Lunarlake
* mcelog: Add Emerald Rapids
* mcelog: Add decode support for Sapphire Rapids
* Update PFA_test_howto
* mcelog: Add support for Meteor Lake
-------------------------------------------------------------------
Thu Oct 06 14:56:44 UTC 2022 - trenn@suse.de
- Includes following SLE 15 SP5 jira features:
* jsc#PED-671 mcelog: Update to latest release
* jsc#PED-686 [CPU Features] Update mcelog support for ADL-N
* jsc#PED-638 [CPU Features] Update mcelog support for MTL-P
- Update to version 189:
* mcelog: Add another Raptor Lake CPU model
* Fix generation of cputype files
* mcelog: Add missing model numbers for Broadwell and Raptorlake
* mcelog: Makefile: Only touch cputype.h if needed to create it
* Makefile: add install-nodoc target
* Use env as the shebang target
* Add missing dependencies for cputype include files
* mcelog: Reverse sens of check to call resolveaddr()
* mcelog: Reverse the sense of the check to set memory_error_support
* mcelog: Drop CASE_INTEL define
* mcelog: Generate cpu_choices[] from table
* mcelog: Generate the cputype_name[] array from the table
* mcelog: Add CPU model numbers to table and generate switch function
* mcelog: Generate CPU_* enums from a table
* mcelog: Add two more Alderlake model numbers
* mcelog: Reduce default threshold for corrected error page offline
* Make genconfig use python3
* mcelog: Add support for Raptorlake
* Fix warnings in sysfs.c
* mcelog: Change "DDR4" string to "DDR" for i10nm platforms
* Fix logrotate syntax
* remove outdated mcelog.conf.5 manual file
* add furture print function for Python2
* fix python errors in genconfig.py
* fix the buf not freed in read_field
* mcelog: Print warning for locked down kernel
* mcelog: Handle sysfs files without length
- Had to adapt to the latest CPU identification model
mainline patch:
b54ee05056a76e mcelog: Drop CASE_INTEL define
and friends
A add_new_amd_cpu_defines
D add-defines.patch
M Start-consolidating-AMD-specific-stuff.patch
M add-f10h-support.patch
M add-f11h-support.patch
M add-f12h-support.patch
M add-f14h-support.patch
M add-f15h-support.patch
M add-f16h-support.patch
M email.patch
M fix_setgroups_missing_call.patch
-------------------------------------------------------------------
Tue May 03 11:32:42 UTC 2022 - moritz.kodytek@suse.com
- Update to version 181:
* mcelog: Add support for Raptorlake
- Adapt patches to latest git version
M Start-consolidating-AMD-specific-stuff.patch
M add-f10h-support.patch
M add-f11h-support.patch
M add-f12h-support.patch
M add-f14h-support.patch
M add-f15h-support.patch
M add-f16h-support.patch
M email.patch
M fix_setgroups_missing_call.patch
M mcelog_invert_prefill_db_warning.patch
- Use Python3 shebang instead of python
A python3_shebang
- Use Github URL
-------------------------------------------------------------------
Wed Apr 13 12:44:57 UTC 2022 - moritz.kodytek@suse.com
- Update to version 180:
* Fix warnings in sysfs.c
* mcelog: Change "DDR4" string to "DDR" for i10nm platforms
* Fix logrotate syntax
* remove outdated mcelog.conf.5 manual file
* add furture print function for Python2
* fix python errors in genconfig.py
* fix the buf not freed in read_field
* mcelog: Print warning for locked down kernel
* mcelog: Handle sysfs files without length
* Fix make test fail
-------------------------------------------------------------------
Wed Sep 01 14:30:27 UTC 2021 - trenn@suse.de
- Update to version 178:
* mcelog: Fix typo/thinko in yellow cache change
-------------------------------------------------------------------
Mon Jul 19 13:44:53 UTC 2021 - trenn@suse.de
- Update to version 177:
* README: Mark up filename as code/monospace
* README: Correct filename of `.os_version`
-------------------------------------------------------------------
Fri Jul 09 13:29:25 UTC 2021 - trenn@suse.de
- Update to version 177 (jsc#SLE-18903):
* mcelog: Update MSCOD error bit descriptions to match SDM
* mcelog: Fix issues with "yellow" cache offlining
* Add reference to Linux::MCELog
* test: avoid the pfa test hang
- Add _service git magic
-------------------------------------------------------------------
Thu Apr 1 16:25:59 UTC 2021 - Yaroslav Kurlaev <yaroslav.kurlaev@gmail.com>
- Remove deprecated "StandardOutput=syslog" option from the systemd
unit file to remove a warning from systemd. (bsc#1185151)
-------------------------------------------------------------------
Tue Jan 26 17:43:06 UTC 2021 - trenn@suse.de
- Update to version 175 (jsc#SLE-14450):
* mcelog: Add a test case to test page error counter replacement.
* mcelog: Use 'num-errors' to specify the number of mce records to be injected.
* mcelog: Report how often the replacement of page CE counter happened
* mcelog: Limit memory consumption for counting CEs per page
* mcelog: Add support for Sapphirerapids server. (jsc#SLE-14450)
* mcelog: i10nm: Fix mapping from bank number to functional unit
- Only refreshing patches, due to tarball modifications:
M Start-consolidating-AMD-specific-stuff.patch
M add-f10h-support.patch
M add-f11h-support.patch
M add-f12h-support.patch
M add-f14h-support.patch
M add-f15h-support.patch
M add-f16h-support.patch
M email.patch
M fix_setgroups_missing_call.patch
M mcelog_invert_prefill_db_warning.patch
-------------------------------------------------------------------
Mon Sep 28 10:16:15 UTC 2020 - trenn@suse.de
- jsc#SLE-13505, jsc#SLE-13494
- Update to version 173:
* mcelog: Rebalance the red-black tree after inserting a new node
* mcelog: Add Tigerlake, Rocketlake, Alderlake, Lakefield
* mcelog.service: Check existence of `/dev/mcelog` in systemd
* mcelog.service: Remove DefaultStandardOutput configuration
* mcelog: Add decode for MCi_MISC from 10nm memory controller
* Add reporter tracking to trigger-invoking functions.
* mcelog: Add "kflags" field to "struct mce"
-------------------------------------------------------------------
Sun Aug 16 19:01:17 UTC 2020 - Dirk Mueller <dmueller@suse.com>
- update to 170:
* mcelog: Add Cometlake client model numbers
* mcelog: Do not start mcelog service if edac_mce_amd module is loaded
* mcelog: Decode and print stepping from cpuid
* mcelog: Add "kflags" field to "struct mce"
* Add reporter tracking to trigger-invoking functions.
* mcelog: Add decode for MCi_MISC from 10nm memory controller
- covers:
* jsc#SLE-12689
-------------------------------------------------------------------
Wed Nov 20 14:00:53 UTC 2019 - trenn@suse.de
- Update to version 1.66 (jira SLE-10087, jira SLE-8853):
* mcelog: Add support for Icelake server, Icelake-D, and Snow Ridge
M email.patch
-> Patched with fuzz, refresh needed
-------------------------------------------------------------------
Tue Oct 29 15:57:54 UTC 2019 - trenn@suse.de
- Update to version 1.65:
* mcelog: Add Cascade Lake to supported models
-------------------------------------------------------------------
Fri Sep 13 16:04:20 UTC 2019 - Jean Delvare <jdelvare@suse.com>
- mcelog.systemd: Preload the dmi-sysfs kernel module.
When /dev/mem can't be read (which is the case when booting in
Secure Mode), mcelog can use the dmi-sysfs interface instead,
however for that the kernel module needs to be loaded first
(bsc#1149186).
-------------------------------------------------------------------
Fri Sep 06 11:25:34 UTC 2019 - MMuschner@suse.com
- Update to version 1.64:
* mcelog: Add Icelake client model numbers.
* add Hygon Dhyana support to not use mcelog, as Hygon Dhyana(0x18h) share similiar arch with AMD Family 17h
-------------------------------------------------------------------
Mon Mar 25 11:31:24 UTC 2019 - christian.voegl@suse.com
- Update to version 1.62:
* mcelog: Fix memory controller bank channel mappings for Skylake
* mcelog: update tests for new error code
* mcelog: Add decoding for Optane DC persistent memory mode
* mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
- Change mcelog.spec to use autosetup
-------------------------------------------------------------------
Fri Sep 21 15:52:28 UTC 2018 - opensuse-packaging@opensuse.org
(by trenn@suse.de)
- Update to version 1.60 (fate#326221):
* Turn back rb_color field into unsigned long
* trigger: add a sync argument for waiting trigger child process exit
* page: trigger: add pre/post sync trigger when doing soft memory offline
* fixed build errors for some lose code when merging code
* transfer the page address to pre/post-sync-trigger scripts
* mcelog: Fix "--ascii" parsing to cope with change in kernel output since v4.10
* Remove now unused local variable
* Add scripts file to do MCA error code validation for a selected CPU model
* Add license file
* mcelog: Improve decoding for APEI reported errors
-------------------------------------------------------------------
Thu Nov 23 13:40:46 UTC 2017 - rbrown@suse.com
- Replace references to /var/adm/fillup-templates with new
%_fillupdir macro (boo#1069468)
-------------------------------------------------------------------
Fri Jul 07 13:59:28 UTC 2017 - fschnizlein@suse.com
- Update to version 1.53:
* Add service file
* dmi: Handle NULL DMI string
* Compress some fields in mempage.
* Add coverity fixes
* Fix typo in man page
* mcelog: Check whether we successfully changed directory for trigger.
* mcelog version: Add ability for OS to define version
* Document .os_release in README
* Set SO_PASSCRED on listen sockets
* memutil.h: add missing include for va_list
-------------------------------------------------------------------
Mon Mar 20 14:28:54 UTC 2017 - trenn@suse.de
- Package also includes fixes for (through previous version updates below):
* Add mcelog-skylake.patch patch to support Skylake Xeons (fate#319698)
* Add skylake support (bnc#946734)
* Avoid warnings at boot up (bsc#920197)
* Knights Landing (fate#319507)
* Broadwell Ex and Ep (fate#319697)
* Different Skylake models (fate#319696)
-------------------------------------------------------------------
Fri Mar 3 09:26:33 UTC 2017 - mpluskal@suse.com
- Update to version 1.48
* Fix warning with gcc 6.x
* Remove obsolete TODO file
* Small fixes
- Use url for getting sources
-------------------------------------------------------------------
Fri Jan 13 15:20:55 UTC 2017 - felix.gerling@suse.com
- Version update to 1.47 (fate#321308, fate#320907, fate#321931):
* Fix PDF links
* Fix confusing error message
-------------------------------------------------------------------
Sat Dec 17 00:02:34 UTC 2016 - tchvatal@suse.com
- Version update to 1.46:
* Various cpu support for new machines
- Refresh patches:
* add-f10h-support.patch
* email.patch
- Force build with pic
- Use normal webpage as Url and do not point to git
- Fix build with --as-needed expanded Makefile patch for email.patch
-------------------------------------------------------------------
Fri May 6 16:08:48 UTC 2016 - trenn@suse.de
- Update to bugfix version 1.36
- Do not start mcelog service based on an udev (/dev/mcelog) rule (bsc#976781)
-------------------------------------------------------------------
Thu Jan 28 14:25:26 UTC 2016 - trenn@suse.de
- Update to latest version 1.29.
Mostly little bug fixes.
-------------------------------------------------------------------
Mon Sep 28 13:26:21 UTC 2015 - trenn@suse.de
- Update to version v124. Adds skylake CPU support and some bug fixes.
-------------------------------------------------------------------
Mon Jun 15 16:18:55 UTC 2015 - trenn@suse.de
- Update to latest v120 git tag and name the version 1.20:
New supported CPUs:
- Add model number for Broadwell-DE
- Added Knights Landing (Xeon Phi)
- Add all current Atom cpuids
- Support Broadwell-U
- New manpages: mcelog.conf.5 and mcelog.triggers.5
And quite some undocumented bugfixes, see git log for details
-------------------------------------------------------------------
Fri Jan 23 11:04:40 UTC 2015 - trenn@suse.de
- Update to version 1.0.8
- Remove patch which got integrated mainline:
0001-Continue-without-dmi-when-no-SMBIOS-or-SMBIOS-0x0-in.patch
- Fix possible security issue, build service complained about:
missing-call-to-setgroups-before-setuid
Add fix_setgroups_missing_call.patch
-------------------------------------------------------------------
Fri Nov 14 18:25:22 UTC 2014 - crrodriguez@opensuse.org
- While not yet defined, the tmpfiles_create macro takes
an argument for it to actually work
-------------------------------------------------------------------
Sat Sep 20 03:16:05 UTC 2014 - crrodriguez@opensuse.org
- mcelog.tmpfiles, mcelog-socket-path.patch, move socket
and pid file to /run/mcelog directory.
This update may require reboot as the relevant rpm macro
tmpfiles_create is not yet in any product.
-------------------------------------------------------------------
Wed Sep 3 15:41:05 UTC 2014 - trenn@suse.de
- Fixed the architecture tag to %{ix86} as suggested by:
Andreas Vetter <asvetter@cip.physik.uni-wuerzburg.de>
-------------------------------------------------------------------
Wed Sep 3 14:41:21 UTC 2014 - meissner@suse.com
- fixed the architecture tag to %ix86
-------------------------------------------------------------------
Fri Jul 11 08:17:28 UTC 2014 - juwolf@suse.com
- Fixed license, GPL-2.0
-------------------------------------------------------------------
Fri Jun 27 13:54:52 UTC 2014 - juwolf@suse.com
- Added: 0001-Continue-without-dmi-when-no-SMBIOS-or-SMBIOS-0x0-in.patch
Continue without dmi when no SMBIOS or SMBIOS=0x0 in /sys/firmware/efi/systab, bnc#829862
-------------------------------------------------------------------
Fri May 16 15:47:18 UTC 2014 - trenn@suse.de
- Add mce decoding support for latest AMD CPUs (bnc#871881).
- Implementation done by Borislav Petkov <bp@suse.de>
* Add patches/Start-consolidating-AMD-specific-stuff.patch
* Add add-defines.patch
* Add add-f10h-support.patch
* Add add-f11h-support.patch
* Add add-f12h-support.patch
* Add add-f14h-support.patch
* Add add-f15h-support.patch
* Add add-f16h-support.patch
-------------------------------------------------------------------
Mon Apr 28 16:49:38 UTC 2014 - trenn@suse.de
- Update to latest git tag v101.
- Mainline decided to finally do a version upgrade to v101
- Remove v1.1 again, obsolete it and go for version v102
- Some important fixes in the latest update:
- bnc#873159
- bnc#873725
-------------------------------------------------------------------
Sun Oct 27 18:00:04 UTC 2013 - crrodriguez@opensuse.org
- Cleanup spec file
- activate mcelog service via udev+systemd combo, if the kernel
registers a /dev/mcelog device the service will be automatically
started.
- drop sysvinit scripts, add appropriate %pre %post invocations
of the needed systemd macros.
-------------------------------------------------------------------
Tue Oct 15 17:02:13 UTC 2013 - trenn@suse.de
- Updated to latest git HEAD:
commit c7bf28088f056925c04d4fd5768504c59bbf19c4
Author: Robin Holt <robinmholt@gmail.com>
Date: Mon Sep 16 04:30:02 2013 -0500
Because upstream does not use proper tags/revisions, I now
versioned this one mcelog-1.1
-------------------------------------------------------------------
Fri Feb 22 13:00:21 UTC 2013 - rmilasan@suse.com
- Install mcelog.service accordingly (/usr/lib/systemd for 12.3
and up or /lib/systemd for older versions).
-------------------------------------------------------------------
Thu Aug 16 14:41:55 UTC 2012 - trenn@suse.de
- bnc#774226 mcelog + systemd: won't start without MCELOG_ADMIN_EMAIL set
- fix uninitialized variable mail_setup
-------------------------------------------------------------------
Wed Jul 18 12:49:43 UTC 2012 - trenn@suse.de
- Also build mcelog packages for i386 (bnc#770726)
-------------------------------------------------------------------
Tue Apr 24 23:43:56 YEKT 2012 - avm-xandry@yandex.ru
- Fixed description in init-file.
-------------------------------------------------------------------
Wed Nov 23 21:36:36 UTC 2011 - crrodriguez@opensuse.org
- Add systemd unit.
-------------------------------------------------------------------
Thu Aug 18 00:09:50 CEST 2011 - ro@suse.de
- update to GIT of today (6e4e2a000124f08f1a4e3791c2b02ec9ae6af393)
- many bugfixes
- Implement re-parsing of mcelog output in ASCII
- Add support for non-page aligned EFI Configuration Tables
- Add --debug-numerrors
- Add decoder for corrected XEN events to --ascii
- Correctly log kernel supplied time
- record the trigger info in the log
- mcelog: Implement dmi decoding for UEFI
- mcelog: Add usage information to mcelog for --ignorenodev
- Fix length calculation of SMBIOS mapping
- change disclaimer
- explictly spell out corrected errors
-------------------------------------------------------------------
Sat Jul 2 21:50:53 UTC 2011 - trenn@suse.de
- Update to latest git version (fate#311830)
Unfortunately versions have not been increased, latest tag
still is 1.0-pre3 (same as 1 year ago), therefore the date
is included in the version. I try to push maintainers to
increase the version number.
- Invert logic of db prefill messages -> info if it works, silent
if not
-------------------------------------------------------------------
Tue Jun 7 09:51:57 UTC 2011 - trenn@suse.de
- Remove test email address from config
-------------------------------------------------------------------
Mon Oct 25 15:48:57 CEST 2010 - trenn@suse.de
- Add Sandybridge/Westmere decode support
- Fix domainname for email notification
- Update to latest git version
-------------------------------------------------------------------
Tue Apr 6 15:15:45 CEST 2010 - trenn@suse.de
- Update to latest git version having quite some fixes (no features):
- Fixed some memleaks and made app valgrind conform
- Fixed theoretical DoS attack (bnc#586241)
- Added support of additional cpus
- Fixed a lot messages (in manpage, in triggers, in README, ...)
-------------------------------------------------------------------
Fri Feb 19 00:39:36 CET 2010 - ro@suse.de
- Update to version 1.0pre3
- Boxboro-EX enhancements
- Bugfixes
Minor pidfile handling adjusting in service file
- Added missing conf file and trigger scripts
-------------------------------------------------------------------
Thu Dec 3 12:12:40 CET 2009 - trenn@suse.de
- Minor .spec and init script fixes/cleanups
-------------------------------------------------------------------
Thu Dec 3 11:28:05 CET 2009 - trenn@suse.de
- Add service parts:
- let mcelog --daemon handle pid file in /var/run/mcelog.pid
- add insserv logic
- remove cron.daily script in update case
-------------------------------------------------------------------
Fri Nov 27 22:01:40 CET 2009 - trenn@suse.de
- Update to latest git version (called it 1.0pre1 myself, may differ
with a possible public 1.0pre1 version), this includes
following new features:
- yellow bit support
- page predictive failure analysis support
- Initial memdb support
This allows to account memory errors in memory in daemon mode
And a lot more...
-------------------------------------------------------------------
Fri Oct 2 17:06:03 CEST 2009 - trenn@suse.de
- Update to latest git version (0.9pre)
Introduces mcelog daemon mode, service file will follow in an
extra commit.
-------------------------------------------------------------------
Fri Jan 9 08:41:58 CET 2009 - olh@suse.de
- use ExclusiveArch as in /SRC/arch/
-------------------------------------------------------------------
Sat Sep 27 21:50:27 CEST 2008 - trenn@suse.de
- fate #304279 mcelog support for Tigerton/Dunnington
Patch is from Andi himself with this statement:
While it looks large most of it is just new tables.
-------------------------------------------------------------------
Mon May 29 16:23:12 CEST 2006 - ak@suse.de
- decode intel thermal events too (#179327)
-------------------------------------------------------------------
Fri May 5 19:00:23 CEST 2006 - ak@suse.de
- Update to 0.7. This fixes
- Fix --dmi option (#166324)
- Incorporate old patches
-------------------------------------------------------------------
Fri Mar 3 20:21:01 CET 2006 - ak@suse.de
- Avoid cosmetic problem in --filter (#153347)
-------------------------------------------------------------------
Wed Feb 8 14:58:50 CET 2006 - ak@suse.de
- update to mcelog 0.6
* Fixes bugs (#148869, #137985)
* Adds --dmi option to map addresses to DIMMs using SMBIOS
(default to off)
-------------------------------------------------------------------
Wed Jan 25 21:45:03 CET 2006 - mls@suse.de
- converted neededforbuild to BuildRequires
-------------------------------------------------------------------
Mon Dec 19 10:42:35 CET 2005 - sf@suse.de
- update to version 0.5
* Clarify --ascii in the manpage
*Support for AMD K8 Revision F machine check DRAM error
thresholding
-------------------------------------------------------------------
Fri Feb 11 10:39:53 CET 2005 - ak@suse.de
- Use RPM_OPT_FLAGS
- Improve description again
-------------------------------------------------------------------
Thu Feb 10 19:21:39 CET 2005 - ak@suse.de
- mcelog-0.4:
* add support to decode AMD K8 (Opteron/Athlon64/AthlonFX) and
Intel P4 (Xeon and Pentium 4) events
* add --ascii option to decode machine check panic information
- Rewrite description in .spec file
-------------------------------------------------------------------
Wed Jun 9 21:51:14 CEST 2004 - ak@suse.de
- mcelog-0.2:
* fix mcelog looping (#41863)
* Add GPL notices
-------------------------------------------------------------------
Thu Mar 25 17:55:05 CET 2004 - sf@suse.de
- initial version
- fixes #36898

4
mcelog.obsinfo Normal file
View File

@ -0,0 +1,4 @@
name: mcelog
version: 196
mtime: 1698794375
commit: edfe78a0dc54a940f4916a9bd681eab7b3f746d1

118
mcelog.spec Normal file
View File

@ -0,0 +1,118 @@
#
# spec file for package mcelog
#
# Copyright (c) 2023 SUSE LLC
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
# upon. The license for this file, and modifications and additions to the
# file, is the same license as for the pristine package itself (unless the
# license for the pristine package is not an Open Source License, in which
# case the license is the MIT License). An "Open Source License" is a
# license that conforms to the Open Source Definition (Version 1.9)
# published by the Open Source Initiative.
# Please submit bugfixes or comments via https://bugs.opensuse.org/
#
#Compat macro for new _fillupdir macro introduced in Nov 2017
%if ! %{defined _fillupdir}
%define _fillupdir %{_localstatedir}/adm/fillup-templates
%endif
Name: mcelog
Version: 196
Release: 0
Summary: Log Machine Check Events
License: GPL-2.0-only
Group: System/Monitoring
URL: https://mcelog.org/
Source: mcelog-%{version}.tar.gz
Source2: mcelog.sysconfig
Source3: mcelog.systemd
Source5: mcelog.tmpfiles
Source6: README.email_setup
Patch1: email.patch
Patch2: mcelog_invert_prefill_db_warning.patch
Patch3: Start-consolidating-AMD-specific-stuff.patch
Patch4: add_new_amd_cpu_defines
Patch5: patches/add-f10h-support.patch
Patch6: patches/add-f11h-support.patch
Patch7: patches/add-f12h-support.patch
Patch8: patches/add-f14h-support.patch
Patch9: patches/add-f15h-support.patch
Patch10: patches/add-f16h-support.patch
Patch11: mcelog-socket-path.patch
Patch12: fix_setgroups_missing_call.patch
BuildRequires: libesmtp-devel
BuildRequires: pkgconfig
BuildRequires: pkgconfig(systemd)
Requires: logrotate
Requires(pre): %fillup_prereq
ExclusiveArch: %{ix86} x86_64
%{?systemd_requires}
%description
mcelog retrieves machine check events from an x86-64 kernel in a cron
job, decodes them, and logs them to %{_localstatedir}/log/mcelog.
A machine check event is a hardware error detected by the CPU.
It should run on any x86-64 system.
In addition, it allows decoding machine check kernel panic messages.
%prep
%autosetup
%build
echo "%{version}" > .os_version
%make_build CFLAGS="%{optflags} -fpie -pie"
%install
# Redirect the upstream install target into the buildroot. "make -e"
# gives the exported environment variables precedence over assignments
# of the same names inside the Makefile.
export prefix=%{buildroot}%{_prefix}
export etcprefix=%{buildroot}
make -e install
# logrotate configuration (mcelog.logrotate is taken from the build
# directory, i.e. not from one of the SourceN files).
mkdir -p %{buildroot}%{_sysconfdir}/logrotate.d/
install -m644 mcelog.logrotate %{buildroot}%{_sysconfdir}/logrotate.d/mcelog
# sysconfig template; it is processed by the fillup macro in the post
# scriptlet.
mkdir -p %{buildroot}%{_fillupdir}
install -m 644 %{SOURCE2} %{buildroot}%{_fillupdir}/sysconfig.mcelog
# Package documentation: the e-mail setup README shipped with the
# package sources and a PDF taken from the build directory.
mkdir -p %{buildroot}/%{_docdir}/%{name}
install -m 644 %{SOURCE6} %{buildroot}/%{_docdir}/%{name}/README.email_setup
install -m 644 lk10-mcelog.pdf %{buildroot}/%{_docdir}/%{name}/lk10-mcelog.pdf
# systemd unit and tmpfiles.d snippet; -D creates missing parent
# directories.
install -D -m 0644 %{SOURCE3} %{buildroot}%{_unitdir}/mcelog.service
install -D -m 0644 %{SOURCE5} %{buildroot}%{_tmpfilesdir}/mcelog.conf
# rcmcelog is a symlink to the generic "service" wrapper.
# NOTE(review): presumably the usual SUSE rc-shortcut convention -- confirm.
ln -sf %{_sbindir}/service %{buildroot}%{_sbindir}/rcmcelog
%pre
# systemd service bookkeeping, pre-install half (distribution macro).
%service_add_pre %{name}.service
%post
# Process the sysconfig template installed into the fillup directory.
# NOTE(review): presumably merges it into /etc/sysconfig/mcelog, which is
# the path the unit file reads -- confirm against the fillup macro docs.
%fillup_only
%service_add_post %{name}.service
# Create the runtime directory right away, but only when the build
# environment provides the tmpfiles_create macro.
%{?tmpfiles_create:%tmpfiles_create %{_tmpfilesdir}/mcelog.conf}
%preun
# systemd service bookkeeping on removal, pre-uninstall half.
%service_del_preun %{name}.service
%postun
# systemd service bookkeeping on removal, post-uninstall half.
%service_del_postun %{name}.service
%files
# Default ownership root:root; file modes are taken from the buildroot
# ("-"), directories default to mode 755.
%defattr (-,root,root,755)
%{_mandir}/man8/*
%{_mandir}/man5/*
%{_sbindir}/mcelog
# Configuration files.
%config %{_sysconfdir}/logrotate.d/mcelog
%dir %{_sysconfdir}/mcelog
%config %{_sysconfdir}/mcelog/mcelog.conf
%{_fillupdir}/sysconfig.mcelog
# Trigger files installed below the mcelog configuration directory; note
# that they are not marked as config files.
%{_sysconfdir}/mcelog/*trigger
%{_unitdir}/mcelog.service
%{_tmpfilesdir}/mcelog.conf
%{_docdir}/%{name}
%{_sbindir}/rcmcelog
# Runtime directory: listed as a ghost entry so the package owns it
# without shipping it in the payload; the tmpfiles.d snippet creates it.
%ghost /run/mcelog
%changelog

10
mcelog.sysconfig Normal file
View File

@ -0,0 +1,10 @@
## Path: Hardware/machine_check
## Description: email address machine check exceptions are sent to
## Type: string
## Default: ""
#
# Machine check exceptions, such as memory errors (correctable or
# uncorrectable ECC errors) and processor or other hardware errors, are
# sent with a detailed description to this address. Also read
# README.email_setup for further details.
#
MCELOG_ADMIN_EMAIL=""

13
mcelog.systemd Normal file
View File

@ -0,0 +1,13 @@
# systemd unit that runs mcelog as a long-lived logging daemon.
[Unit]
Description=Machine Check Exception Logging Daemon
# Start only when systemd detects no virtualization.
ConditionVirtualization=false
# Start only when the /dev/mcelog device node is present.
ConditionPathExists=/dev/mcelog
[Service]
# The leading "-" makes the environment file optional: a missing file is
# not treated as an error.
EnvironmentFile=-/etc/sysconfig/mcelog
# Load the msr and dmi-sysfs kernel modules before the daemon starts.
# These commands carry no "-" prefix, so a failing modprobe makes the
# whole unit fail to start.
# NOTE(review): confirm that hard-failing is intended on kernels that do
# not provide one of these modules.
ExecStartPre=/sbin/modprobe msr
ExecStartPre=/sbin/modprobe dmi-sysfs
# The daemon is kept in the foreground so systemd supervises the process
# directly. Option meanings per the mcelog man page -- TODO confirm:
# --ignorenodev exits silently if the device cannot be opened, --daemon
# selects daemon mode, --foreground prevents detaching.
ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --foreground
[Install]
WantedBy=multi-user.target

1
mcelog.tmpfiles Normal file
View File

@ -0,0 +1 @@
# Create the directory /run/mcelog with mode 0755, owned by root:root;
# the trailing "-" means no age-based cleanup is configured.
d /run/mcelog 0755 root root -

View File

@ -0,0 +1,23 @@
---
memdb.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
--- a/memdb.c
+++ b/memdb.c
@@ -431,11 +431,11 @@
md->location = xstrdup(bl);
md->name = xstrdup(dmi_getstring(&d->header, d->device_locator));
}
- if (missed) {
- static int warned;
- if (!warned) {
- Eprintf("failed to prefill DIMM database from DMI data");
- warned = 1;
+ if (!missed) {
+ static int db_rill_msg;
+ if (!db_rill_msg) {
+ Gprintf("Prefilled DIMM database from DMI data");
+ db_rill_msg = 1;
}
}
}