Re: [PATCH 3/3] EDAC, ghes: Make it a proper module

From: Mauro Carvalho Chehab
Date: Wed Jul 26 2017 - 06:24:16 EST


Em Wed, 26 Jul 2017 10:48:27 +0200
Borislav Petkov <bp@xxxxxxxxx> escreveu:

> From: Borislav Petkov <bp@xxxxxxx>
>
> Register with the GHES notifier chain so that there's no need to call
> into the module with ghes_edac_report_mem_error().

Hmm... I'm not seeing any implementation that would allow setting
between firmware first, hardware first or "auto", as we've discussed.

>
> Signed-off-by: Borislav Petkov <bp@xxxxxxx>
> ---
> drivers/acpi/apei/ghes.c | 18 +++----
> drivers/edac/Kconfig | 4 +-
> drivers/edac/ghes_edac.c | 129 ++++++++++++++++++++++++-----------------------
> include/acpi/ghes.h | 25 ---------
> 4 files changed, 74 insertions(+), 102 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 3f05f5ff3461..37cd698cacd2 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -475,11 +475,11 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
> static void ghes_do_proc(struct ghes *ghes,
> const struct acpi_hest_generic_status *estatus)
> {
> - int sev, sec_sev;
> struct acpi_hest_generic_data *gdata;
> guid_t *sec_type;
> guid_t *fru_id = &NULL_UUID_LE;
> char *fru_text = "";
> + int sev, sec_sev;
>
> sev = ghes_severity(estatus->error_severity);
> apei_estatus_for_each_section(estatus, gdata) {
> @@ -494,7 +494,8 @@ static void ghes_do_proc(struct ghes *ghes,
> if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
> struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>
> - ghes_edac_report_mem_error(ghes, sev, mem_err);
> +
> + atomic_notifier_call_chain(&ghes_edac_chain, sev, &mem_err);
>
> arch_apei_report_mem_error(sev, mem_err);
> ghes_handle_memory_failure(gdata, sev);
> @@ -1153,10 +1154,6 @@ static int ghes_probe(struct platform_device *ghes_dev)
> goto err;
> }
>
> - rc = ghes_edac_register(ghes, &ghes_dev->dev);
> - if (rc < 0)
> - goto err;
> -
> switch (generic->notify.type) {
> case ACPI_HEST_NOTIFY_POLLED:
> setup_deferrable_timer(&ghes->timer, ghes_poll_func,
> @@ -1169,13 +1166,13 @@ static int ghes_probe(struct platform_device *ghes_dev)
> if (rc) {
> pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
> generic->header.source_id);
> - goto err_edac_unreg;
> + goto err;
> }
> rc = request_irq(ghes->irq, ghes_irq_func, 0, "GHES IRQ", ghes);
> if (rc) {
> pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
> generic->header.source_id);
> - goto err_edac_unreg;
> + goto err;
> }
> break;
>
> @@ -1204,8 +1201,7 @@ static int ghes_probe(struct platform_device *ghes_dev)
> ghes_proc(ghes);
>
> return 0;
> -err_edac_unreg:
> - ghes_edac_unregister(ghes);
> +
> err:
> if (ghes) {
> ghes_fini(ghes);
> @@ -1255,8 +1251,6 @@ static int ghes_remove(struct platform_device *ghes_dev)
>
> ghes_fini(ghes);
>
> - ghes_edac_unregister(ghes);
> -
> kfree(ghes);
>
> platform_set_drvdata(ghes_dev, NULL);
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index 96afb2aeed18..fdd8278ca89a 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -53,8 +53,8 @@ config EDAC_DECODE_MCE
> has been initialized.
>
> config EDAC_GHES
> - bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
> - depends on ACPI_APEI_GHES && (EDAC=y)
> + tristate "Output ACPI APEI/GHES BIOS detected errors via EDAC"
> + depends on ACPI_APEI_GHES
> help
> Not all machines support hardware-driven error report. Some of those
> provide a BIOS-driven error report mechanism via ACPI, using the
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index 6f80eb65c26c..ecd34b8bd27e 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -5,6 +5,9 @@
> * License version 2.
> *
> * Copyright (c) 2013 by Mauro Carvalho Chehab
> + * (c) 2017 Borislav Petkov
> + *
> + * Borislav Petkov: turn it into a proper module.
> *
> * Red Hat Inc. http://www.redhat.com
> */
> @@ -28,10 +31,10 @@ struct ghes_edac_pvt {
> char msg[80];
> };
>
> -static LIST_HEAD(ghes_reglist);
> -static DEFINE_MUTEX(ghes_edac_lock);
> -static int ghes_edac_mc_num;
> +static struct ghes_edac_pvt *ghes_pvt;
>
> +/* Hand it into EDAC's core so that we have a device to operate on. */
> +static struct device dummy_dev;
>
> /* Memory Device - Type 17 of SMBIOS spec */
> struct memdev_dmi_entry {
> @@ -163,24 +166,21 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
> }
> }
>
> -void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
> - struct cper_sec_mem_err *mem_err)
> +static int report_mem_error(struct notifier_block *nb, unsigned long sev, void *data)
> {
> + struct cper_sec_mem_err *mem_err = data;
> enum hw_event_mc_err_type type;
> struct edac_raw_error_desc *e;
> struct mem_ctl_info *mci;
> - struct ghes_edac_pvt *pvt = NULL;
> - char *p;
> + struct ghes_edac_pvt *pvt = ghes_pvt;
> u8 grain_bits;
> + char *p;
>
> - list_for_each_entry(pvt, &ghes_reglist, list) {
> - if (ghes == pvt->ghes)
> - break;
> - }
> if (!pvt) {
> - pr_err("Internal error: Can't find EDAC structure\n");
> - return;
> + edac_pr_err("Internal error: Can't find EDAC structure\n");
> + return NOTIFY_DONE;
> }
> +
> mci = pvt->mci;
> e = &mci->error_desc;
>
> @@ -400,23 +400,40 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
>
> /* Report the error via EDAC API */
> edac_raw_mc_handle_error(type, mci, e);
> +
> + return NOTIFY_DONE;
> }
> -EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
>
> -int ghes_edac_register(struct ghes *ghes, struct device *dev)
> +static struct notifier_block ghes_nb = {
> + .notifier_call = report_mem_error,
> +};
> +
> +static const char * const fake_msg =
> +"This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"
> +"Unfortunately, not all BIOSes reflect the memory layout correctly.\n"
> +"So, the end result of using this driver varies from vendor to vendor.\n"
> +"If you find incorrect reports, please contact your hardware vendor\n"
> +"to correct its BIOS.";
> +
> +static const char * const super_crap_msg =
> +"This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"
> +"Its SMBIOS info is wrong. It is doubtful that the error report would\n"
> +"work on such system. Use this driver with caution.";

I would also add a note saying that the BIOS may be masking errors.

> +
> +static int __init ghes_edac_register(void)
> {
> + struct ghes_edac_pvt *pvt = ghes_pvt;
> bool fake = false;
> int rc, num_dimm = 0;
> struct mem_ctl_info *mci;
> struct edac_mc_layer layers[1];
> - struct ghes_edac_pvt *pvt;
> struct ghes_edac_dimm_fill dimm_fill;
>
> /* Get the number of DIMMs */
> dmi_walk(ghes_edac_count_dimms, &num_dimm);
>
> /* Check if we've got a bogus BIOS */
> - if (num_dimm == 0) {
> + if (!num_dimm) {
> fake = true;
> num_dimm = 1;
> }
> @@ -429,21 +446,17 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
> * We need to serialize edac_mc_alloc() and edac_mc_add_mc(),
> * to avoid duplicated memory controller numbers
> */
> - mutex_lock(&ghes_edac_lock);
> - mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
> - sizeof(*pvt));
> + mci = edac_mc_alloc(1, ARRAY_SIZE(layers), layers, sizeof(*pvt));
> if (!mci) {
> - pr_info("Can't allocate memory for EDAC data\n");
> - mutex_unlock(&ghes_edac_lock);
> + edac_pr_err("Can't allocate memory for EDAC data\n");
> return -ENOMEM;
> }
>
> pvt = mci->pvt_info;
> memset(pvt, 0, sizeof(*pvt));
> - list_add_tail(&pvt->list, &ghes_reglist);
> - pvt->ghes = ghes;
> pvt->mci = mci;
> - mci->pdev = dev;
> +
> + mci->pdev = &dummy_dev;
>
> mci->mtype_cap = MEM_FLAG_EMPTY;
> mci->edac_ctl_cap = EDAC_FLAG_NONE;
> @@ -452,21 +465,12 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
> mci->ctl_name = "ghes_edac";
> mci->dev_name = "ghes";
>
> - if (!ghes_edac_mc_num) {
> - if (!fake) {
> - pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
> - pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
> - pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
> - pr_info("If you find incorrect reports, please contact your hardware vendor\n");
> - pr_info("to correct its BIOS.\n");
> - pr_info("This system has %d DIMM sockets.\n",
> - num_dimm);
> - } else {
> - pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
> - pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
> - pr_info("work on such system. Use this driver with caution\n");
> - }
> - }
> + if (!fake)
> + edac_pr_info("%s\n", fake_msg);
> + else
> + edac_pr_info("%s\n", super_crap_msg);
> +
> + edac_pr_info("This system has %d DIMM sockets.\n", num_dimm);
>
> if (!fake) {
> /*
> @@ -475,13 +479,11 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
> * Keep it in blank for the other memory controllers, as
> * there's no reliable way to properly credit each DIMM to
> * the memory controller, as different BIOSes fill the
> - * DMI bank location fields on different ways
> + * DMI bank location fields in different ways.
> */
> - if (!ghes_edac_mc_num) {
> - dimm_fill.count = 0;
> - dimm_fill.mci = mci;
> - dmi_walk(ghes_edac_dmidecode, &dimm_fill);
> - }
> + dimm_fill.count = 0;
> + dimm_fill.mci = mci;
> + dmi_walk(ghes_edac_dmidecode, &dimm_fill);
> } else {
> struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
> mci->n_layers, 0, 0, 0);
> @@ -495,30 +497,31 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
>
> rc = edac_mc_add_mc(mci);
> if (rc < 0) {
> - pr_info("Can't register at EDAC core\n");
> + edac_pr_err("Can't register with EDAC core\n");
> edac_mc_free(mci);
> - mutex_unlock(&ghes_edac_lock);
> return -ENODEV;
> }
>
> - ghes_edac_mc_num++;
> - mutex_unlock(&ghes_edac_lock);
> + ghes_register_edac_chain(&ghes_nb);
> +
> return 0;
> }
> -EXPORT_SYMBOL_GPL(ghes_edac_register);
> +module_init(ghes_edac_register);
>
> -void ghes_edac_unregister(struct ghes *ghes)
> +static void __exit ghes_edac_unregister(void)
> {
> struct mem_ctl_info *mci;
> - struct ghes_edac_pvt *pvt, *tmp;
> -
> - list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) {
> - if (ghes == pvt->ghes) {
> - mci = pvt->mci;
> - edac_mc_del_mc(mci->pdev);
> - edac_mc_free(mci);
> - list_del(&pvt->list);
> - }
> - }
> +
> + ghes_unregister_edac_chain(&ghes_nb);
> +
> + mci = find_mci_by_dev(&dummy_dev);
> + WARN_ON(!mci);
> +
> + edac_mc_del_mc(mci->pdev);
> + edac_mc_free(mci);
> +
> }
> -EXPORT_SYMBOL_GPL(ghes_edac_unregister);
> +module_exit(ghes_edac_unregister);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("GHES error decoding module");
> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
> index 506b6a197738..c02b8eb91bd6 100644
> --- a/include/acpi/ghes.h
> +++ b/include/acpi/ghes.h
> @@ -51,31 +51,6 @@ enum {
> GHES_SEV_PANIC = 0x3,
> };
>
> -/* From drivers/edac/ghes_edac.c */
> -
> -#ifdef CONFIG_EDAC_GHES
> -void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
> - struct cper_sec_mem_err *mem_err);
> -
> -int ghes_edac_register(struct ghes *ghes, struct device *dev);
> -
> -void ghes_edac_unregister(struct ghes *ghes);
> -
> -#else
> -static inline void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
> - struct cper_sec_mem_err *mem_err)
> -{
> -}
> -
> -static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
> -{
> - return 0;
> -}
> -
> -static inline void ghes_edac_unregister(struct ghes *ghes)
> -{
> -}
> -#endif
> void ghes_register_edac_chain(struct notifier_block *nb);
> void ghes_unregister_edac_chain(struct notifier_block *nb);
>



Thanks,
Mauro