[PATCH 3/3] x86/MCE/AMD, EDAC/mce_amd: Support non-uniform MCA bank type enumeration

From: Yazen Ghannam
Date: Thu Dec 02 2021 - 21:00:51 EST


AMD systems currently lay out MCA bank types such that the type of bank
number "i" is either the same across all CPUs or is Reserved/Read-as-Zero.

For example:

Bank # | CPUx | CPUy
0 LS LS
1 RAZ UMC
2 CS CS
3 SMU RAZ

Future AMD systems will lay out MCA bank types such that the type of
bank number "i" may be different across CPUs.

For example:

Bank # | CPUx | CPUy
0 LS LS
1 RAZ UMC
2 CS NBIO
3 SMU RAZ

Change the structures that cache MCA bank types to be per-CPU and update
smca_get_bank_type() to handle this change.

Move some SMCA-specific structures to amd.c from mce.h, since they no
longer need to be global.

Break out the "count" for bank types from struct smca_hwid, since this
should provide a per-CPU count rather than a system-wide count.

Apply the "const" qualifier to the struct smca_hwid_mcatypes array. The
values in this array should not change at runtime.

Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
---
arch/x86/include/asm/mce.h | 18 +-------
arch/x86/kernel/cpu/mce/amd.c | 60 ++++++++++++++-----------
drivers/edac/mce_amd.c | 12 +----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
4 files changed, 38 insertions(+), 54 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index d6834e8fbb6a..a0b3cdd8e6ec 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -330,22 +330,6 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};

-#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
-
-struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
- u32 hwid_mcatype; /* (hwid,mcatype) tuple */
- u8 count; /* Number of instances. */
-};
-
-struct smca_bank {
- struct smca_hwid *hwid;
- u32 id; /* Value of MCA_IPID[InstanceId]. */
- u8 sysfs_id; /* Value used for sysfs name. */
-};
-
-extern struct smca_bank smca_banks[MAX_NR_BANKS];
-
extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);

@@ -353,7 +337,7 @@ extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);

void mce_amd_feature_init(struct cpuinfo_x86 *c);
-enum smca_bank_types smca_get_bank_type(unsigned int bank);
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
#else

static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e12a8f3414f5..6ec87e580145 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};

+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
struct smca_bank_name {
const char *name; /* Short name for sysfs */
const char *long_name; /* Long name for pretty-printing */
@@ -127,14 +143,14 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);

-enum smca_bank_types smca_get_bank_type(unsigned int bank)
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
struct smca_bank *b;

if (bank >= MAX_NR_BANKS)
return N_SMCA_BANK_TYPES;

- b = &smca_banks[bank];
+ b = &per_cpu(smca_banks, cpu)[bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;

@@ -142,7 +158,7 @@ enum smca_bank_types smca_get_bank_type(unsigned int bank)
}
EXPORT_SYMBOL_GPL(smca_get_bank_type);

-static struct smca_hwid smca_hwid_mcatypes[] = {
+static const struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype } */

/* Reserved type */
@@ -223,9 +239,6 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_UNKNOWN, HWID_MCATYPE(0xFFF, 0xFFFF) },
};

-struct smca_bank smca_banks[MAX_NR_BANKS];
-EXPORT_SYMBOL_GPL(smca_banks);
-
/*
* In SMCA enabled processors, we can have multiple banks for a given IP type.
* So to define a unique name for each bank, we use a temp c-string to append
@@ -281,8 +294,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)

static void smca_configure(unsigned int bank, unsigned int cpu)
{
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
- struct smca_hwid *s_hwid;
u32 high, low;
u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);

@@ -318,10 +332,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)

smca_set_misc_banks_map(bank, cpu);

- /* Return early if this bank was already initialized. */
- if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
- return;
-
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
@@ -335,9 +345,9 @@ static void smca_configure(unsigned int bank, unsigned int cpu)

if (hwid_mcatype == s_hwid->hwid_mcatype ||
s_hwid->bank_type == SMCA_UNKNOWN) {
- smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = low;
- smca_banks[bank].sysfs_id = s_hwid->count++;
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
break;
}
}
@@ -623,7 +633,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,

bool amd_filter_mce(struct mce *m)
{
- enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;

/* See Family 17h Models 10h-2Fh Erratum #1114. */
@@ -661,7 +671,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
} else if (c->x86 == 0x17 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {

- if (smca_get_bank_type(bank) != SMCA_IF)
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
return;

msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
@@ -729,7 +739,7 @@ bool amd_mce_is_memory_error(struct mce *m)
u8 xec = (m->status >> 16) & 0x1f;

if (mce_flags.smca)
- return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
+ return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;

return m->bank == 4 && xec == 0x8;
}
@@ -1045,7 +1055,7 @@ static struct kobj_type threshold_ktype = {
.release = threshold_block_release,
};

-static const char *get_name(unsigned int bank, struct threshold_block *b)
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
{
enum smca_bank_types bank_type;

@@ -1056,7 +1066,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}

- bank_type = smca_get_bank_type(bank);
+ bank_type = smca_get_bank_type(cpu, bank);
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;

@@ -1066,11 +1076,11 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return NULL;
}

- if (smca_banks[bank].hwid->count == 1) {
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) {
if (bank_type == SMCA_UNKNOWN) {
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
"%s_%x", smca_get_name(bank_type),
- smca_banks[bank].id);
+ per_cpu(smca_banks, cpu)[bank].id);

return buf_mcatype;
} else {
@@ -1081,11 +1091,11 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
if (b && bank_type == SMCA_UNKNOWN) {
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
"%s_%x_block_%u", smca_get_name(bank_type),
- smca_banks[bank].id, b->block);
+ per_cpu(smca_banks, cpu)[bank].id, b->block);
} else {
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
"%s_%u", smca_get_name(bank_type),
- smca_banks[bank].sysfs_id);
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
}

return buf_mcatype;
@@ -1143,7 +1153,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
else
tb->blocks = b;

- err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
recurse:
@@ -1198,7 +1208,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = get_name(bank, NULL);
+ const char *name = get_name(cpu, bank, NULL);
int err = 0;

if (!dev)
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 720df7b4d6ab..39ffde96a343 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1166,20 +1166,10 @@ static void decode_mc6_mce(struct mce *m)
/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
- struct smca_hwid *hwid;
- enum smca_bank_types bank_type;
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
const char *ip_name;
u8 xec = XEC(m->status, xec_mask);

- if (m->bank >= ARRAY_SIZE(smca_banks))
- return;
-
- hwid = smca_banks[m->bank].hwid;
- if (!hwid)
- return;
-
- bank_type = hwid->bank_type;
-
if (bank_type == SMCA_RESERVED) {
pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
return;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 08133de21fdd..75dad0214dc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2647,7 +2647,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
* and error occurred in DramECC (Extended error code = 0) then only
* process the error, else bail out.
*/
- if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+ if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
(XEC(m->status, 0x3f) == 0x0)))
return NOTIFY_DONE;

--
2.25.1