[PATCH 19/21] amd64_edac: add a leaner syndrome decoding algorithm

From: Borislav Petkov
Date: Mon Nov 16 2009 - 09:19:15 EST


Instead of using the whole syndrome tables for channel decoding, use a
set of eigenvectors with which the tables can be generated to search for
the syndrome in error. The algorithm operates independent of symbol size
and can be used for both x4 and x8 syndromes.

Signed-off-by: Borislav Petkov <borislav.petkov@xxxxxxx>
---
drivers/edac/amd64_edac.c | 280 +++++++++++++++++++++++++--------------------
1 files changed, 154 insertions(+), 126 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 351334e..0969a40 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -792,7 +792,7 @@ static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr)
return csrow;
}

-static int get_channel_from_ecc_syndrome(unsigned short syndrome);
+static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16);

static void amd64_cpu_display_info(struct amd64_pvt *pvt)
{
@@ -1113,7 +1113,7 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,

/* CHIPKILL enabled */
if (info->nbcfg & K8_NBCFG_CHIPKILL) {
- channel = get_channel_from_ecc_syndrome(syndrome);
+ channel = get_channel_from_ecc_syndrome(mci, syndrome);
if (channel < 0) {
/*
* Syndrome didn't map, so we don't know which of the
@@ -1672,7 +1672,7 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
* syndrome to isolate which channel the error was on.
*/
if (pvt->nbcfg & K8_NBCFG_CHIPKILL)
- chan = get_channel_from_ecc_syndrome(syndrome);
+ chan = get_channel_from_ecc_syndrome(mci, syndrome);

if (chan >= 0) {
edac_mc_handle_ce(mci, page, offset, syndrome,
@@ -1808,142 +1808,170 @@ static struct pci_dev *pci_get_related_function(unsigned int vendor,
}

/*
- * syndrome mapping table for ECC ChipKill devices
+ * These are tables of eigenvectors (one per line) which can be used for the
+ * construction of the syndrome tables. The modified syndrome search algorithm
+ * uses those to find the symbol in error and thus the DIMM.
*
- * The comment in each row is the token (nibble) number that is in error.
- * The least significant nibble of the syndrome is the mask for the bits
- * that are in error (need to be toggled) for the particular nibble.
- *
- * Each row contains 16 entries.
- * The first entry (0th) is the channel number for that row of syndromes.
- * The remaining 15 entries are the syndromes for the respective Error
- * bit mask index.
- *
- * 1st index entry is 0x0001 mask, indicating that the rightmost bit is the
- * bit in error.
- * The 2nd index entry is 0x0010 that the second bit is damaged.
- * The 3rd index entry is 0x0011 indicating that the rightmost 2 bits
- * are damaged.
- * Thus so on until index 15, 0x1111, whose entry has the syndrome
- * indicating that all 4 bits are damaged.
- *
- * A search is performed on this table looking for a given syndrome.
- *
- * See the AMD documentation for ECC syndromes. This ECC table is valid
- * across all the versions of the AMD64 processors.
- *
- * A fast lookup is to use the LAST four bits of the 16-bit syndrome as a
- * COLUMN index, then search all ROWS of that column, looking for a match
- * with the input syndrome. The ROW value will be the token number.
- *
- * The 0'th entry on that row, can be returned as the CHANNEL (0 or 1) of this
- * error.
+ * Algorithm courtesy of Ross LaFetra from AMD.
*/
-#define NUMBER_ECC_ROWS 36
-static const unsigned short ecc_chipkill_syndromes[NUMBER_ECC_ROWS][16] = {
- /* Channel 0 syndromes */
- {/*0*/ 0, 0xe821, 0x7c32, 0x9413, 0xbb44, 0x5365, 0xc776, 0x2f57,
- 0xdd88, 0x35a9, 0xa1ba, 0x499b, 0x66cc, 0x8eed, 0x1afe, 0xf2df },
- {/*1*/ 0, 0x5d31, 0xa612, 0xfb23, 0x9584, 0xc8b5, 0x3396, 0x6ea7,
- 0xeac8, 0xb7f9, 0x4cda, 0x11eb, 0x7f4c, 0x227d, 0xd95e, 0x846f },
- {/*2*/ 0, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
- 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f },
- {/*3*/ 0, 0x2021, 0x3032, 0x1013, 0x4044, 0x6065, 0x7076, 0x5057,
- 0x8088, 0xa0a9, 0xb0ba, 0x909b, 0xc0cc, 0xe0ed, 0xf0fe, 0xd0df },
- {/*4*/ 0, 0x5041, 0xa082, 0xf0c3, 0x9054, 0xc015, 0x30d6, 0x6097,
- 0xe0a8, 0xb0e9, 0x402a, 0x106b, 0x70fc, 0x20bd, 0xd07e, 0x803f },
- {/*5*/ 0, 0xbe21, 0xd732, 0x6913, 0x2144, 0x9f65, 0xf676, 0x4857,
- 0x3288, 0x8ca9, 0xe5ba, 0x5b9b, 0x13cc, 0xaded, 0xc4fe, 0x7adf },
- {/*6*/ 0, 0x4951, 0x8ea2, 0xc7f3, 0x5394, 0x1ac5, 0xdd36, 0x9467,
- 0xa1e8, 0xe8b9, 0x2f4a, 0x661b, 0xf27c, 0xbb2d, 0x7cde, 0x358f },
- {/*7*/ 0, 0x74e1, 0x9872, 0xec93, 0xd6b4, 0xa255, 0x4ec6, 0x3a27,
- 0x6bd8, 0x1f39, 0xf3aa, 0x874b, 0xbd6c, 0xc98d, 0x251e, 0x51ff },
- {/*8*/ 0, 0x15c1, 0x2a42, 0x3f83, 0xcef4, 0xdb35, 0xe4b6, 0xf177,
- 0x4758, 0x5299, 0x6d1a, 0x78db, 0x89ac, 0x9c6d, 0xa3ee, 0xb62f },
- {/*9*/ 0, 0x3d01, 0x1602, 0x2b03, 0x8504, 0xb805, 0x9306, 0xae07,
- 0xca08, 0xf709, 0xdc0a, 0xe10b, 0x4f0c, 0x720d, 0x590e, 0x640f },
- {/*a*/ 0, 0x9801, 0xec02, 0x7403, 0x6b04, 0xf305, 0x8706, 0x1f07,
- 0xbd08, 0x2509, 0x510a, 0xc90b, 0xd60c, 0x4e0d, 0x3a0e, 0xa20f },
- {/*b*/ 0, 0xd131, 0x6212, 0xb323, 0x3884, 0xe9b5, 0x5a96, 0x8ba7,
- 0x1cc8, 0xcdf9, 0x7eda, 0xafeb, 0x244c, 0xf57d, 0x465e, 0x976f },
- {/*c*/ 0, 0xe1d1, 0x7262, 0x93b3, 0xb834, 0x59e5, 0xca56, 0x2b87,
- 0xdc18, 0x3dc9, 0xae7a, 0x4fab, 0x542c, 0x85fd, 0x164e, 0xf79f },
- {/*d*/ 0, 0x6051, 0xb0a2, 0xd0f3, 0x1094, 0x70c5, 0xa036, 0xc067,
- 0x20e8, 0x40b9, 0x904a, 0x601b, 0x307c, 0x502d, 0x80de, 0xe08f },
- {/*e*/ 0, 0xa4c1, 0xf842, 0x5c83, 0xe6f4, 0x4235, 0x1eb6, 0xba77,
- 0x7b58, 0xdf99, 0x831a, 0x27db, 0x9dac, 0x396d, 0x65ee, 0xc12f },
- {/*f*/ 0, 0x11c1, 0x2242, 0x3383, 0xc8f4, 0xd935, 0xeab6, 0xfb77,
- 0x4c58, 0x5d99, 0x6e1a, 0x7fdb, 0x84ac, 0x956d, 0xa6ee, 0xb72f },
-
- /* Channel 1 syndromes */
- {/*10*/ 1, 0x45d1, 0x8a62, 0xcfb3, 0x5e34, 0x1be5, 0xd456, 0x9187,
- 0xa718, 0xe2c9, 0x2d7a, 0x68ab, 0xf92c, 0xbcfd, 0x734e, 0x369f },
- {/*11*/ 1, 0x63e1, 0xb172, 0xd293, 0x14b4, 0x7755, 0xa5c6, 0xc627,
- 0x28d8, 0x4b39, 0x99aa, 0xfa4b, 0x3c6c, 0x5f8d, 0x8d1e, 0xeeff },
- {/*12*/ 1, 0xb741, 0xd982, 0x6ec3, 0x2254, 0x9515, 0xfbd6, 0x4c97,
- 0x33a8, 0x84e9, 0xea2a, 0x5d6b, 0x11fc, 0xa6bd, 0xc87e, 0x7f3f },
- {/*13*/ 1, 0xdd41, 0x6682, 0xbbc3, 0x3554, 0xe815, 0x53d6, 0xce97,
- 0x1aa8, 0xc7e9, 0x7c2a, 0xa1fb, 0x2ffc, 0xf2bd, 0x497e, 0x943f },
- {/*14*/ 1, 0x2bd1, 0x3d62, 0x16b3, 0x4f34, 0x64e5, 0x7256, 0x5987,
- 0x8518, 0xaec9, 0xb87a, 0x93ab, 0xca2c, 0xe1fd, 0xf74e, 0xdc9f },
- {/*15*/ 1, 0x83c1, 0xc142, 0x4283, 0xa4f4, 0x2735, 0x65b6, 0xe677,
- 0xf858, 0x7b99, 0x391a, 0xbadb, 0x5cac, 0xdf6d, 0x9dee, 0x1e2f },
- {/*16*/ 1, 0x8fd1, 0xc562, 0x4ab3, 0xa934, 0x26e5, 0x6c56, 0xe387,
- 0xfe18, 0x71c9, 0x3b7a, 0xb4ab, 0x572c, 0xd8fd, 0x924e, 0x1d9f },
- {/*17*/ 1, 0x4791, 0x89e2, 0xce73, 0x5264, 0x15f5, 0xdb86, 0x9c17,
- 0xa3b8, 0xe429, 0x2a5a, 0x6dcb, 0xf1dc, 0xb64d, 0x783e, 0x3faf },
- {/*18*/ 1, 0x5781, 0xa9c2, 0xfe43, 0x92a4, 0xc525, 0x3b66, 0x6ce7,
- 0xe3f8, 0xb479, 0x4a3a, 0x1dbb, 0x715c, 0x26dd, 0xd89e, 0x8f1f },
- {/*19*/ 1, 0xbf41, 0xd582, 0x6ac3, 0x2954, 0x9615, 0xfcd6, 0x4397,
- 0x3ea8, 0x81e9, 0xeb2a, 0x546b, 0x17fc, 0xa8bd, 0xc27e, 0x7d3f },
- {/*1a*/ 1, 0x9891, 0xe1e2, 0x7273, 0x6464, 0xf7f5, 0x8586, 0x1617,
- 0xb8b8, 0x2b29, 0x595a, 0xcacb, 0xdcdc, 0x4f4d, 0x3d3e, 0xaeaf },
- {/*1b*/ 1, 0xcce1, 0x4472, 0x8893, 0xfdb4, 0x3f55, 0xb9c6, 0x7527,
- 0x56d8, 0x9a39, 0x12aa, 0xde4b, 0xab6c, 0x678d, 0xef1e, 0x23ff },
- {/*1c*/ 1, 0xa761, 0xf9b2, 0x5ed3, 0xe214, 0x4575, 0x1ba6, 0xbcc7,
- 0x7328, 0xd449, 0x8a9a, 0x2dfb, 0x913c, 0x365d, 0x688e, 0xcfef },
- {/*1d*/ 1, 0xff61, 0x55b2, 0xaad3, 0x7914, 0x8675, 0x2ca6, 0xd3c7,
- 0x9e28, 0x6149, 0xcb9a, 0x34fb, 0xe73c, 0x185d, 0xb28e, 0x4def },
- {/*1e*/ 1, 0x5451, 0xa8a2, 0xfcf3, 0x9694, 0xc2c5, 0x3e36, 0x6a67,
- 0xebe8, 0xbfb9, 0x434a, 0x171b, 0x7d7c, 0x292d, 0xd5de, 0x818f },
- {/*1f*/ 1, 0x6fc1, 0xb542, 0xda83, 0x19f4, 0x7635, 0xacb6, 0xc377,
- 0x2e58, 0x4199, 0x9b1a, 0xf4db, 0x37ac, 0x586d, 0x82ee, 0xed2f },
-
- /* ECC bits are also in the set of tokens and they too can go bad
- * first 2 cover channel 0, while the second 2 cover channel 1
- */
- {/*20*/ 0, 0xbe01, 0xd702, 0x6903, 0x2104, 0x9f05, 0xf606, 0x4807,
- 0x3208, 0x8c09, 0xe50a, 0x5b0b, 0x130c, 0xad0d, 0xc40e, 0x7a0f },
- {/*21*/ 0, 0x4101, 0x8202, 0xc303, 0x5804, 0x1905, 0xda06, 0x9b07,
- 0xac08, 0xed09, 0x2e0a, 0x6f0b, 0x640c, 0xb50d, 0x760e, 0x370f },
- {/*22*/ 1, 0xc441, 0x4882, 0x8cc3, 0xf654, 0x3215, 0xbed6, 0x7a97,
- 0x5ba8, 0x9fe9, 0x132a, 0xd76b, 0xadfc, 0x69bd, 0xe57e, 0x213f },
- {/*23*/ 1, 0x7621, 0x9b32, 0xed13, 0xda44, 0xac65, 0x4176, 0x3757,
- 0x6f88, 0x19a9, 0xf4ba, 0x829b, 0xb5cc, 0xc3ed, 0x2efe, 0x58df }
+static u16 x4_vectors[] = {
+ 0x2f57, 0x1afe, 0x66cc, 0xdd88,
+ 0x11eb, 0x3396, 0x7f4c, 0xeac8,
+ 0x0001, 0x0002, 0x0004, 0x0008,
+ 0x1013, 0x3032, 0x4044, 0x8088,
+ 0x106b, 0x30d6, 0x70fc, 0xe0a8,
+ 0x4857, 0xc4fe, 0x13cc, 0x3288,
+ 0x1ac5, 0x2f4a, 0x5394, 0xa1e8,
+ 0x1f39, 0x251e, 0xbd6c, 0x6bd8,
+ 0x15c1, 0x2a42, 0x89ac, 0x4758,
+ 0x2b03, 0x1602, 0x4f0c, 0xca08,
+ 0x1f07, 0x3a0e, 0x6b04, 0xbd08,
+ 0x8ba7, 0x465e, 0x244c, 0x1cc8,
+ 0x2b87, 0x164e, 0x642c, 0xdc18,
+ 0x40b9, 0x80de, 0x1094, 0x20e8,
+ 0x27db, 0x1eb6, 0x9dac, 0x7b58,
+ 0x11c1, 0x2242, 0x84ac, 0x4c58,
+ 0x1be5, 0x2d7a, 0x5e34, 0xa718,
+ 0x4b39, 0x8d1e, 0x14b4, 0x28d8,
+ 0x4c97, 0xc87e, 0x11fc, 0x33a8,
+ 0x8e97, 0x497e, 0x2ffc, 0x1aa8,
+ 0x16b3, 0x3d62, 0x4f34, 0x8518,
+ 0x1e2f, 0x391a, 0x5cac, 0xf858,
+ 0x1d9f, 0x3b7a, 0x572c, 0xfe18,
+ 0x15f5, 0x2a5a, 0x5264, 0xa3b8,
+ 0x1dbb, 0x3b66, 0x715c, 0xe3f8,
+ 0x4397, 0xc27e, 0x17fc, 0x3ea8,
+ 0x1617, 0x3d3e, 0x6464, 0xb8b8,
+ 0x23ff, 0x12aa, 0xab6c, 0x56d8,
+ 0x2dfb, 0x1ba6, 0x913c, 0x7328,
+ 0x185d, 0x2ca6, 0x7914, 0x9e28,
+ 0x171b, 0x3e36, 0x7d7c, 0xebe8,
+ 0x4199, 0x82ee, 0x19f4, 0x2e58,
+ 0x4807, 0xc40e, 0x130c, 0x3208,
+ 0x1905, 0x2e0a, 0x5804, 0xac08,
+ 0x213f, 0x132a, 0xadfc, 0x5ba8,
+ 0x19a9, 0x2efe, 0xb5cc, 0x6f88,
};

-/*
- * Given the syndrome argument, scan each of the channel tables for a syndrome
- * match. Depending on which table it is found, return the channel number.
- */
-static int get_channel_from_ecc_syndrome(unsigned short syndrome)
+static u16 x8_vectors[] = {
+ 0x0145, 0x028a, 0x2374, 0x43c8, 0xa1f0, 0x0520, 0x0a40, 0x1480,
+ 0x0211, 0x0422, 0x0844, 0x1088, 0x01b0, 0x44e0, 0x23c0, 0xed80,
+ 0x1011, 0x0116, 0x022c, 0x0458, 0x08b0, 0x8c60, 0x2740, 0x4e80,
+ 0x0411, 0x0822, 0x1044, 0x0158, 0x02b0, 0x2360, 0x46c0, 0xab80,
+ 0x0811, 0x1022, 0x012c, 0x0258, 0x04b0, 0x4660, 0x8cc0, 0x2780,
+ 0x2071, 0x40e2, 0xa0c4, 0x0108, 0x0210, 0x0420, 0x0840, 0x1080,
+ 0x4071, 0x80e2, 0x0104, 0x0208, 0x0410, 0x0820, 0x1040, 0x2080,
+ 0x8071, 0x0102, 0x0204, 0x0408, 0x0810, 0x1020, 0x2040, 0x4080,
+ 0x019d, 0x03d6, 0x136c, 0x2198, 0x50b0, 0xb2e0, 0x0740, 0x0e80,
+ 0x0189, 0x03ea, 0x072c, 0x0e58, 0x1cb0, 0x56e0, 0x37c0, 0xf580,
+ 0x01fd, 0x0376, 0x06ec, 0x0bb8, 0x1110, 0x2220, 0x4440, 0x8880,
+ 0x0163, 0x02c6, 0x1104, 0x0758, 0x0eb0, 0x2be0, 0x6140, 0xc280,
+ 0x02fd, 0x01c6, 0x0b5c, 0x1108, 0x07b0, 0x25a0, 0x8840, 0x6180,
+ 0x0801, 0x012e, 0x025c, 0x04b8, 0x1370, 0x26e0, 0x57c0, 0xb580,
+ 0x0401, 0x0802, 0x015c, 0x02b8, 0x22b0, 0x13e0, 0x7140, 0xe280,
+ 0x0201, 0x0402, 0x0804, 0x01b8, 0x11b0, 0x31a0, 0x8040, 0x7180,
+ 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080,
+ 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+ 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000,
+};
+
+static int decode_syndrome(u16 syndrome, u16 *vectors, int num_vecs,
+ int v_dim)
{
- int row;
- int column;
+ unsigned int i, err_sym;
+
+ for (err_sym = 0; err_sym < num_vecs / v_dim; err_sym++) {
+ u16 s = syndrome;
+ int v_idx = err_sym * v_dim;
+ int v_end = (err_sym + 1) * v_dim;
+
+ /* walk over all 16 bits of the syndrome */
+ for (i = 1; i < (1U << 16); i <<= 1) {
+
+ /* if bit is set in that eigenvector... */
+ if (v_idx < v_end && vectors[v_idx] & i) {
+ u16 ev_comp = vectors[v_idx++];
+
+ /* ... and bit set in the modified syndrome, */
+ if (s & i) {
+ /* remove it. */
+ s ^= ev_comp;

- /* Determine column to scan */
- column = syndrome & 0xF;
+ if (!s)
+ return err_sym;
+ }

- /* Scan all rows, looking for syndrome, or end of table */
- for (row = 0; row < NUMBER_ECC_ROWS; row++) {
- if (ecc_chipkill_syndromes[row][column] == syndrome)
- return ecc_chipkill_syndromes[row][0];
+ } else if (s & i)
+ /* can't get to zero, move to next symbol */
+ break;
+ }
}

debugf0("syndrome(%x) not found\n", syndrome);
return -1;
}

+static int map_err_sym_to_channel(int err_sym, int sym_size)
+{
+ if (sym_size == 4)
+ switch (err_sym) {
+ case 0x20:
+ case 0x21:
+ return 0;
+ break;
+ case 0x22:
+ case 0x23:
+ return 1;
+ break;
+ default:
+ return err_sym >> 4;
+ break;
+ }
+ /* x8 symbols */
+ else
+ switch (err_sym) {
+ /* imaginary bits not in a DIMM */
+ case 0x10:
+ WARN(1, KERN_ERR "Invalid error symbol: 0x%x\n",
+ err_sym);
+ return -1;
+ break;
+
+ case 0x11:
+ return 0;
+ break;
+ case 0x12:
+ return 1;
+ break;
+ default:
+ return err_sym >> 3;
+ break;
+ }
+ return -1;
+}
+
+static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
+{
+ struct amd64_pvt *pvt = mci->pvt_info;
+ u32 value = 0;
+ int err_sym = 0;
+
+ amd64_read_pci_cfg(pvt->misc_f3_ctl, 0x180, &value);
+
+ /* F3x180[EccSymbolSize]=1, x8 symbols */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model > 7 &&
+ value & BIT(25)) {
+ err_sym = decode_syndrome(syndrome, x8_vectors,
+ ARRAY_SIZE(x8_vectors), 8);
+ return map_err_sym_to_channel(err_sym, 8);
+ } else {
+ err_sym = decode_syndrome(syndrome, x4_vectors,
+ ARRAY_SIZE(x4_vectors), 4);
+ return map_err_sym_to_channel(err_sym, 4);
+ }
+}
+
/*
* Check for valid error in the NB Status High register. If so, proceed to read
* NB Status Low, NB Address Low and NB Address High registers and store data
--
1.6.4.3


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/