Re: [PATCH 06/19] EDAC, mc: Remove per layer counters

From: Mauro Carvalho Chehab
Date: Fri Oct 11 2019 - 06:40:43 EST


Em Thu, 10 Oct 2019 20:25:16 +0000
Robert Richter <rrichter@xxxxxxxxxxx> escreveu:

> Looking at how mci->{ue,ce}_per_layer[EDAC_MAX_LAYERS] is used, it
> turns out that only the leaves in the memory hierarchy are consumed
> (in sysfs), but not the intermediate layers, e.g.:
>
> count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
>
> These unused counters only add complexity, remove them. The error
> counter values are directly stored in struct dimm_info now.

Hmm... not sure if this patch is correct. I remember that there are some
border cases on some drivers (maybe the 3-layer drivers used together
with RDIMM memory controllers?) where some errors are not associated
to an specific dimm, but, instead, are related to a problem at the memory
bus.

Also, depending on how the memory controllers are organized[1], the ECC
logic groups memory on DIMM pairs. So, when an error occur, it may be
either at DIMM1 or DIMM2.

[1] On Intel, this happens with pre-Nehalem memory controllers.

Due to that, storing errors at the dimm struct sounds wrong, as the
error may affect multiple DIMMs or even the entire layer.

>
> Signed-off-by: Robert Richter <rrichter@xxxxxxxxxxx>
> ---
> drivers/edac/edac_mc.c | 104 ++++++++++++-----------------------
> drivers/edac/edac_mc_sysfs.c | 20 +++----
> drivers/edac/ghes_edac.c | 5 +-
> include/linux/edac.h | 7 +--
> 4 files changed, 46 insertions(+), 90 deletions(-)
>
> diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
> index 45b02bb31964..c1e142643006 100644
> --- a/drivers/edac/edac_mc.c
> +++ b/drivers/edac/edac_mc.c
> @@ -315,10 +315,9 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> struct csrow_info *csr;
> struct rank_info *chan;
> struct dimm_info *dimm;
> - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
> unsigned int pos[EDAC_MAX_LAYERS];
> - unsigned int idx, size, tot_dimms = 1, count = 1;
> - unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
> + unsigned int idx, size, tot_dimms = 1;
> + unsigned int tot_csrows = 1, tot_channels = 1;
> void *pvt, *p, *ptr = NULL;
> int i, j, row, chn, n, len;
> bool per_rank = false;
> @@ -346,19 +345,10 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * stringent as what the compiler would provide if we could simply
> * hardcode everything into a single struct.
> */
> - mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> - layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> - for (i = 0; i < n_layers; i++) {
> - count *= layers[i].size;
> - edac_dbg(4, "errcount layer %d size %d\n", i, count);
> - ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> - ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
> - tot_errcount += 2 * count;
> - }
> -
> - edac_dbg(4, "allocating %d error counters\n", tot_errcount);
> - pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> - size = ((unsigned long)pvt) + sz_pvt;
> + mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
> + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
> + pvt = edac_align_ptr(&ptr, sz_pvt, 1);
> + size = ((unsigned long)pvt) + sz_pvt;
>
> edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
> size,
> @@ -374,10 +364,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num,
> * rather than an imaginary chunk of memory located at address 0.
> */
> layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
> - for (i = 0; i < n_layers; i++) {
> - mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
> - mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
> - }
> pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
>
> /* setup index and various internal pointers */
> @@ -908,53 +894,31 @@ const char *edac_layer_name[] = {
> EXPORT_SYMBOL_GPL(edac_layer_name);
>
> static void edac_inc_ce_error(struct mem_ctl_info *mci,
> - bool enable_per_layer_report,
> const int pos[EDAC_MAX_LAYERS],
> const u16 count)
> {
> - int i, index = 0;
> + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>
> mci->ce_mc += count;
>
> - if (!enable_per_layer_report) {
> + if (dimm)
> + dimm->ce_count += count;
> + else
> mci->ce_noinfo_count += count;
> - return;
> - }
> -
> - for (i = 0; i < mci->n_layers; i++) {
> - if (pos[i] < 0)
> - break;
> - index += pos[i];
> - mci->ce_per_layer[i][index] += count;
> -
> - if (i < mci->n_layers - 1)
> - index *= mci->layers[i + 1].size;
> - }
> }
>
> static void edac_inc_ue_error(struct mem_ctl_info *mci,
> - bool enable_per_layer_report,
> const int pos[EDAC_MAX_LAYERS],
> const u16 count)
> {
> - int i, index = 0;
> + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]);
>
> mci->ue_mc += count;
>
> - if (!enable_per_layer_report) {
> + if (dimm)
> + dimm->ue_count += count;
> + else
> mci->ue_noinfo_count += count;
> - return;
> - }
> -
> - for (i = 0; i < mci->n_layers; i++) {
> - if (pos[i] < 0)
> - break;
> - index += pos[i];
> - mci->ue_per_layer[i][index] += count;
> -
> - if (i < mci->n_layers - 1)
> - index *= mci->layers[i + 1].size;
> - }
> }
>
> static void edac_ce_error(struct mem_ctl_info *mci,
> @@ -965,7 +929,6 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> const char *label,
> const char *detail,
> const char *other_detail,
> - const bool enable_per_layer_report,
> const unsigned long page_frame_number,
> const unsigned long offset_in_page,
> long grain)
> @@ -988,7 +951,7 @@ static void edac_ce_error(struct mem_ctl_info *mci,
> error_count, msg, msg_aux, label,
> location, detail);
> }
> - edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);
> + edac_inc_ce_error(mci, pos, error_count);
>
> if (mci->scrub_mode == SCRUB_SW_SRC) {
> /*
> @@ -1018,8 +981,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> const char *location,
> const char *label,
> const char *detail,
> - const char *other_detail,
> - const bool enable_per_layer_report)
> + const char *other_detail)
> {
> char *msg_aux = "";
>
> @@ -1048,7 +1010,7 @@ static void edac_ue_error(struct mem_ctl_info *mci,
> msg, msg_aux, label, location, detail);
> }
>
> - edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
> + edac_inc_ue_error(mci, pos, error_count);
> }
>
> void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> @@ -1064,16 +1026,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
> "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
> e->page_frame_number, e->offset_in_page,
> e->grain, e->syndrome);
> - edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> - detail, e->other_detail, e->enable_per_layer_report,
> + edac_ce_error(mci, e->error_count, pos, e->msg, e->location,
> + e->label, detail, e->other_detail,
> e->page_frame_number, e->offset_in_page, e->grain);
> } else {
> snprintf(detail, sizeof(detail),
> "page:0x%lx offset:0x%lx grain:%ld",
> e->page_frame_number, e->offset_in_page, e->grain);
>
> - edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
> - detail, e->other_detail, e->enable_per_layer_report);
> + edac_ue_error(mci, e->error_count, pos, e->msg, e->location,
> + e->label, detail, e->other_detail);
> }
>
>
> @@ -1099,6 +1061,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> int i, n_labels = 0;
> u8 grain_bits;
> struct edac_raw_error_desc *e = &mci->error_desc;
> + bool any_memory = true;
>
> edac_dbg(3, "MC%d\n", mci->mc_idx);
>
> @@ -1116,9 +1079,9 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
>
> /*
> * Check if the event report is consistent and if the memory
> - * location is known. If it is known, enable_per_layer_report will be
> - * true, the DIMM(s) label info will be filled and the per-layer
> - * error counters will be incremented.
> + * location is known. If it is known, the DIMM(s) label info
> + * will be filled and the DIMM's error counters will be
> + * incremented.
> */
> for (i = 0; i < mci->n_layers; i++) {
> if (pos[i] >= (int)mci->layers[i].size) {
> @@ -1136,7 +1099,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> pos[i] = -1;
> }
> if (pos[i] >= 0)
> - e->enable_per_layer_report = true;
> + any_memory = false;
> }
>
> /*
> @@ -1166,16 +1129,17 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> e->grain = dimm->grain;
>
> /*
> - * If the error is memory-controller wide, there's no need to
> - * seek for the affected DIMMs because the whole
> - * channel/memory controller/... may be affected.
> - * Also, don't show errors for empty DIMM slots.
> + * If the error is memory-controller wide, there's no
> + * need to seek for the affected DIMMs because the
> + * whole channel/memory controller/... may be
> + * affected. Also, don't show errors for empty DIMM
> + * slots.
> */
> - if (!e->enable_per_layer_report || !dimm->nr_pages)
> + if (any_memory || !dimm->nr_pages)
> continue;
>
> if (n_labels >= EDAC_MAX_LABELS) {
> - e->enable_per_layer_report = false;
> + any_memory = true;
> break;
> }
> n_labels++;
> @@ -1205,7 +1169,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
> chan = -2;
> }
>
> - if (!e->enable_per_layer_report) {
> + if (any_memory) {
> strcpy(e->label, "any memory");
> } else {
> edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
> diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
> index 0367554e7437..8682df2f7f4f 100644
> --- a/drivers/edac/edac_mc_sysfs.c
> +++ b/drivers/edac/edac_mc_sysfs.c
> @@ -556,10 +556,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
> char *data)
> {
> struct dimm_info *dimm = to_dimm(dev);
> - u32 count;
>
> - count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
> - return sprintf(data, "%u\n", count);
> + return sprintf(data, "%u\n", dimm->ce_count);
> }
>
> static ssize_t dimmdev_ue_count_show(struct device *dev,
> @@ -567,10 +565,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
> char *data)
> {
> struct dimm_info *dimm = to_dimm(dev);
> - u32 count;
>
> - count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
> - return sprintf(data, "%u\n", count);
> + return sprintf(data, "%u\n", dimm->ue_count);
> }
>
> /* dimm/rank attribute files */
> @@ -666,7 +662,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
> const char *data, size_t count)
> {
> struct mem_ctl_info *mci = to_mci(dev);
> - int cnt, row, chan, i;
> + struct dimm_info *dimm;
> + int row, chan;
> +
> mci->ue_mc = 0;
> mci->ce_mc = 0;
> mci->ue_noinfo_count = 0;
> @@ -682,11 +680,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
> ri->channels[chan]->ce_count = 0;
> }
>
> - cnt = 1;
> - for (i = 0; i < mci->n_layers; i++) {
> - cnt *= mci->layers[i].size;
> - memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
> - memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
> + mci_for_each_dimm(mci, dimm) {
> + dimm->ue_count = 0;
> + dimm->ce_count = 0;
> }
>
> mci->start_time = jiffies;
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 842080d7b33a..e0b90c6d7d63 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -347,11 +347,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
> mem_err->mem_dev_handle);
>
> index = get_dimm_smbios_index(mem_err->mem_dev_handle);
> - if (index >= 0) {
> + if (index >= 0)
> e->top_layer = index;
> - e->enable_per_layer_report = true;
> - }
> -
> }
> if (p > e->location)
> *(p - 1) = '\0';
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index 8beb6e834be9..8e72222e50b0 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -383,6 +383,9 @@ struct dimm_info {
> unsigned int csrow, cschannel; /* Points to the old API data */
>
> u16 smbios_handle; /* Handle for SMBIOS type 17 */
> +
> + u32 ce_count;
> + u32 ue_count;
> };
>
> /**
> @@ -453,8 +456,6 @@ struct errcount_attribute_data {
> * @location: location of the error
> * @label: label of the affected DIMM(s)
> * @other_detail: other driver-specific detail about the error
> - * @enable_per_layer_report: if false, the error affects all layers
> - * (typically, a memory controller error)
> */
> struct edac_raw_error_desc {
> /*
> @@ -475,7 +476,6 @@ struct edac_raw_error_desc {
> unsigned long syndrome;
> const char *msg;
> const char *other_detail;
> - bool enable_per_layer_report;
> };
>
> /* MEMORY controller information structure
> @@ -565,7 +565,6 @@ struct mem_ctl_info {
> */
> u32 ce_noinfo_count, ue_noinfo_count;
> u32 ue_mc, ce_mc;
> - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>
> struct completion complete;
>



Thanks,
Mauro