[PATCH 7/7] EDAC/amd64: RAS: platform/x86/amd: Identify all physical pages in row

From: Muralidhara M K
Date: Wed Oct 25 2023 - 03:38:17 EST


From: Muralidhara M K <muralidhara.mk@xxxxxxx>

AMD systems have HBM memory embedded with the chips, The entire memory
is managed by host OS. Error containment needs to be reliable, because
HBM memory cannot be replaced.

Persist all UMC DRAM ECC errors, the OS can make the bad or poisoned page
state persistent so that it will not use the memory upon the next boot.

The reported MCA error address in HBM in the format PC/SID/Bank/ROW/COL
For example, In MI300A C1/C0 (column bits 1-0) is at SPA bit 6-5. Assuming
PFN only looks at SPA bit 12 or higher, column bits 1-0 could be skipped.
For PFN, SPA bits higher or equal than 12 matters. So column bits c2, c3
and c4 gives 8 possible combination of addresses in a row.

So, Identify all physical pages in a HBM row and retire all the pages
to get rid of intermittent or recurrent memory errors.

Signed-off-by: Muralidhara M K <muralidhara.mk@xxxxxxx>
---
drivers/edac/amd64_edac.c | 5 ++
drivers/ras/amd/atl/umc.c | 103 ++++++++++++++++++++++++++++++++++++++
include/linux/amd-atl.h | 2 +
3 files changed, 110 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 79c6c552ee14..d0db11e19a46 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2838,6 +2838,11 @@ static void decode_umc_error(int node_id, struct mce *m)

error_address_to_page_and_offset(sys_addr, &err);

+ if (pvt->fam == 0x19 && (pvt->model >= 0x90 && pvt->model <= 0x9f)) {
+ if (identify_poison_pages_retire_row(m))
+ return;
+ }
+
log_error:
__log_ecc_error(mci, &err, ecc_type);
}
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 52247a7949fb..d31ad7680ff1 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -255,3 +255,106 @@ int umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
return 0;
}
EXPORT_SYMBOL_GPL(umc_mca_addr_to_sys_addr);
+
+/*
+ * High Bandwidth Memory (HBM v3) has fixed number of columns in a
+ * row (8 columns in one HBM row).
+ * Extract column bits to find all the combination of masks to retire
+ * all the poison pages in a row.
+ */
+#define MAX_COLUMNS_IN_HBM_ROW 8
+
+/* The C2 bit in CH NA address */
+#define UMC_NA_C2_BIT BIT(8)
+/* The C3 bit in CH NA address */
+#define UMC_NA_C3_BIT BIT(9)
+/* The C4 bit in CH NA address */
+#define UMC_NA_C4_BIT BIT(14)
+
+/* masks to get all possible combinations of column addresses */
+#define C_1_1_1_MASK (UMC_NA_C4_BIT | UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_1_1_0_MASK (UMC_NA_C4_BIT | UMC_NA_C3_BIT)
+#define C_1_0_1_MASK (UMC_NA_C4_BIT | UMC_NA_C2_BIT)
+#define C_1_0_0_MASK (UMC_NA_C4_BIT)
+#define C_0_1_1_MASK (UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_0_1_0_MASK (UMC_NA_C3_BIT)
+#define C_0_0_1_MASK (UMC_NA_C2_BIT)
+#define C_0_0_0_MASK ~C_1_1_1_MASK
+
+/* Identify all combination of column address physical pages in a row */
+static int amd_umc_identify_pages_in_row(struct mce *m, u64 *spa_addr)
+{
+ u8 cs_inst_id = get_cs_inst_id(m);
+ u8 socket_id = get_socket_id(m);
+ u64 norm_addr = get_norm_addr(m);
+ u8 die_id = get_die_id(m);
+ u16 df_acc_id = get_df_acc_id(m);
+
+ u64 retire_addr, column;
+ u64 column_masks[] = { 0, C_0_0_1_MASK, C_0_1_0_MASK, C_0_1_1_MASK,
+ C_1_0_0_MASK, C_1_0_1_MASK, C_1_1_0_MASK, C_1_1_1_MASK };
+
+ /* clear and loop for all possibilities of [c4 c3 c2] */
+ norm_addr &= C_0_0_0_MASK;
+
+ for (column = 0; column < ARRAY_SIZE(column_masks); column++) {
+ retire_addr = norm_addr | column_masks[column];
+
+ if (norm_to_sys_addr(df_acc_id, socket_id, die_id, cs_inst_id, &retire_addr))
+ return -EINVAL;
+ *(spa_addr + column) = retire_addr;
+ }
+
+ return 0;
+}
+
+/* Find any duplicate addresses in all combination of column address */
+static void amd_umc_find_duplicate_spa(u64 arr[], int *size)
+{
+ int i, j, k;
+
+ /* use nested for loop to find the duplicate elements in array */
+ for (i = 0; i < *size; i++) {
+ for (j = i + 1; j < *size; j++) {
+ /* check duplicate element */
+ if (arr[i] == arr[j]) {
+ /* delete the current position of the duplicate element */
+ for (k = j; k < (*size - 1); k++)
+ arr[k] = arr[k + 1];
+
+ /* decrease the size of array after removing duplicate element */
+ (*size)--;
+
+ /* if the position of the elements is changes, don't increase index j */
+ j--;
+ }
+ }
+ }
+}
+
+int identify_poison_pages_retire_row(struct mce *m)
+{
+ int i, ret, addr_range;
+ unsigned long pfn;
+ u64 col[MAX_COLUMNS_IN_HBM_ROW];
+ u64 *spa_addr = col;
+
+ /* Identify all pages in a row */
+ pr_info("Identify all physical Pages in a row for MCE addr:0x%llx\n", m->addr);
+ ret = amd_umc_identify_pages_in_row(m, spa_addr);
+ if (!ret) {
+ for (i = 0; i < MAX_COLUMNS_IN_HBM_ROW; i++)
+ pr_info("col[%d]_addr:0x%llx ", i, spa_addr[i]);
+ }
+ /* Find duplicate entries from all 8 physical addresses in a row */
+ addr_range = ARRAY_SIZE(col);
+ amd_umc_find_duplicate_spa(spa_addr, &addr_range);
+ /* do page retirement on all system physical addresses */
+ for (i = 0; i < addr_range; i++) {
+ pfn = PHYS_PFN(spa_addr[i]);
+ memory_failure(pfn, 0);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(identify_poison_pages_retire_row);
diff --git a/include/linux/amd-atl.h b/include/linux/amd-atl.h
index c625ea3ab5d0..df24ae592c4e 100644
--- a/include/linux/amd-atl.h
+++ b/include/linux/amd-atl.h
@@ -25,4 +25,6 @@ static inline int amd_umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
return umc_mca_addr_to_sys_addr(m, sys_addr);
}

+int identify_poison_pages_retire_row(struct mce *m);
+
#endif /* _AMD_ATL_H */
--
2.25.1