Re: [PATCHv2 pci-next 1/2] PCI/AER: correctable error message as KERN_INFO

From: Sathyanarayanan Kuppuswamy
Date: Fri Mar 17 2023 - 14:50:29 EST




On 3/17/23 10:51 AM, Grant Grundler wrote:
> Since correctable errors have been corrected (and counted), the dmesg output
> should not be reported as a warning, but rather as "informational".
>
> Otherwise, using a certain well-known vendor's PCIe parts in a USB4 docking
> station, the dmesg buffer can be spammed with correctable errors, 717 bytes
> per instance, potentially many MB per day.

Why don't you investigate why you are getting so many correctable errors?
Isn't solving the problem preferable to hiding the logs?

>
> Given the "WARN" priority, these messages have already confused the typical
> user who stumbles across them, support staff (triaging feedback reports),
> and more than a few Linux kernel devs. Changing to INFO will hide these
> messages from most audiences.
>
> Signed-off-by: Grant Grundler <grundler@xxxxxxxxxxxx>
> ---
> drivers/pci/pcie/aer.c | 29 +++++++++++++++++++----------
> 1 file changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index f6c24ded134c..cb6b96233967 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -687,23 +687,29 @@ static void __aer_print_error(struct pci_dev *dev,
> {
> const char **strings;
> unsigned long status = info->status & ~info->mask;
> - const char *level, *errmsg;
> int i;
>
> if (info->severity == AER_CORRECTABLE) {
> strings = aer_correctable_error_string;
> - level = KERN_WARNING;
> + pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> + info->status, info->mask);
> } else {
> strings = aer_uncorrectable_error_string;
> - level = KERN_ERR;
> + pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> + info->status, info->mask);
> }
>
> for_each_set_bit(i, &status, 32) {
> - errmsg = strings[i];
> + const char *errmsg = strings[i];
> +
> if (!errmsg)
> errmsg = "Unknown Error Bit";
>
> - pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
> + if (info->severity == AER_CORRECTABLE)
> + pci_info(dev, " [%2d] %-22s%s\n", i, errmsg,
> + info->first_error == i ? " (First)" : "");
> + else
> + pci_err(dev, " [%2d] %-22s%s\n", i, errmsg,
> info->first_error == i ? " (First)" : "");
> }
> pci_dev_aer_stats_incr(dev, info);
> @@ -724,7 +730,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> layer = AER_GET_LAYER_ERROR(info->severity, info->status);
> agent = AER_GET_AGENT(info->severity, info->status);
>
> - level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> + level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;
>
> pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> aer_error_severity_string[info->severity],
> @@ -797,14 +803,17 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
> info.mask = mask;
> info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>
> - pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> __aer_print_error(dev, &info);
> - pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> - aer_error_layer[layer], aer_agent_string[agent]);
>
> - if (aer_severity != AER_CORRECTABLE)
> + if (aer_severity == AER_CORRECTABLE) {
> + pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
> + aer_error_layer[layer], aer_agent_string[agent]);
> + } else {
> + pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> + aer_error_layer[layer], aer_agent_string[agent]);
> pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> aer->uncor_severity);
> + }
>
> if (tlp_header_valid)
> __print_tlp_header(dev, &aer->header_log);

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer