Re: [PATCH v2 1/2] efi/cper, cxl: Decode CXL Protocol Error Section

From: Jonathan Cameron
Date: Thu Nov 03 2022 - 13:50:20 EST


On Fri, 28 Oct 2022 20:09:49 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx> wrote:

> Add support for decoding CXL Protocol Error Section as defined in UEFI 2.10
> Section N.2.13.
>
> Do the section decoding in a new cper_cxl.c file. This new file will be
> used in the future for more CXL CPERs decode support. Add this to the
> existing UEFI_CPER config.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx>
A few really trivial things inline.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>

> ---
> v2:
> Added all fields for decoding. Printed DVSEC and Capability ID.
> Added additional CXL Agent Types based on UEFI 2.10.
> Named the unnamed union to agent addr.
> Changed field name from agent_addr -> rcrb_base_addr.
> subsystem_device_id -> subsystem_id.
> Commented why different union elements are relevant.
> Provided other comments wherever necessary.
> ---
> drivers/firmware/efi/Makefile | 2 +-
> drivers/firmware/efi/cper.c | 9 ++
> drivers/firmware/efi/cper_cxl.c | 152 ++++++++++++++++++++++++++++++++
> drivers/firmware/efi/cper_cxl.h | 66 ++++++++++++++
> 4 files changed, 228 insertions(+), 1 deletion(-)
> create mode 100644 drivers/firmware/efi/cper_cxl.c
> create mode 100644 drivers/firmware/efi/cper_cxl.h
>
> diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
> index 8d151e332584..4f332de54173 100644
> --- a/drivers/firmware/efi/Makefile
> +++ b/drivers/firmware/efi/Makefile
> @@ -19,7 +19,7 @@ endif
> obj-$(CONFIG_EFI_PARAMS_FROM_FDT) += fdtparams.o
> obj-$(CONFIG_EFI_ESRT) += esrt.o
> obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o
> -obj-$(CONFIG_UEFI_CPER) += cper.o
> +obj-$(CONFIG_UEFI_CPER) += cper.o cper_cxl.o
> obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o
> obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o
> subdir-$(CONFIG_EFI_STUB) += libstub
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index e4e5ea7ce910..181deb9af527 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -24,6 +24,7 @@
> #include <linux/bcd.h>
> #include <acpi/ghes.h>
> #include <ras/ras_event.h>
> +#include "cper_cxl.h"
>
> /*
> * CPER record ID need to be unique even after reboot, because record
> @@ -595,6 +596,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
> cper_print_fw_err(newpfx, gdata, fw_err);
> else
> goto err_section_too_small;
> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> +
> + printk("%ssection_type: CXL Protocol Error\n", newpfx);
> + if (gdata->error_data_length >= sizeof(*prot_err))
> + cper_print_prot_err(newpfx, prot_err);
> + else
> + goto err_section_too_small;
> } else {
> const void *err = acpi_hest_get_payload(gdata);
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> new file mode 100644
> index 000000000000..6c94af234be9
> --- /dev/null
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -0,0 +1,152 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * UEFI Common Platform Error Record (CPER) support for CXL Section.
> + *
> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
> + *
> + * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx>
> + */
> +
> +#include <linux/cper.h>
> +#include "cper_cxl.h"
> +
> +#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> +#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1)
> +#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2)
> +#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3)
> +#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4)
> +#define PROT_ERR_VALID_DVSEC BIT_ULL(5)

Error log valid should probably be here. Bit 6.
We might not use it, but it's a bit odd to have all but one entry
from the spec defined.

> +
> +static const char * const prot_err_agent_type_strs[] = {
> + "Restricted CXL Device",
> + "Restricted CXL Host Downstream Port",
> + "CXL Device",
> + "CXL Logical Device",
> + "CXL Fabric Manager managed Logical Device",
> + "CXL Root Port",
> + "CXL Downstream Switch Port",
> + "CXL Upstream Switch Port",
> +};
> +
> +/*
> + * The layout of the enumeration and the values matches CXL Agent Type
> + * field in the UEFI 2.10 Section N.2.13,
> + */
> +enum {
> + RCD, /* Restricted CXL Device */
> + RCH_DP, /* Restricted CXL Host Downstream Port */
> + DEVICE, /* CXL Device */
> + LD, /* CXL Logical Device */
> + FMLD, /* CXL Fabric Manager managed Logical Device */
> + RP, /* CXL Root Port */
> + DSP, /* CXL Downstream Switch Port */
> + USP, /* CXL Upstream Switch Port */
> +};
I would move this above the prot_err_agent_type_strs and use

[RCD] = "Restricted CXL Device". Saves on the trivial effort
for a reviewer of checking the two match up.
Probably don't need the comments on the enum as well as the
strings that say the same thing...

> +
> +void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> +{
> + if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> + pr_info("%s agent_type: %d, %s\n", pfx, prot_err->agent_type,
> + prot_err->agent_type < ARRAY_SIZE(prot_err_agent_type_strs)
> + ? prot_err_agent_type_strs[prot_err->agent_type]
> + : "unknown");
> +
> + if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS) {
> + switch (prot_err->agent_type) {
> + /*
> + * According to UEFI 2.10 Section N.2.13, the term CXL Device
> + * is used to refer to Restricted CXL Device, CXL Device, CXL
> + * Logical Device or a CXL Fabric Manager Managed Logical
> + * Device.
> + */
> + case RCD:
> + case DEVICE:
> + case LD:
> + case FMLD:
> + case RP:
> + case DSP:
> + case USP:
> + pr_info("%s agent_address: %04x:%02x:%02x.%x\n",
> + pfx, prot_err->agent_addr.segment,
> + prot_err->agent_addr.bus,
> + prot_err->agent_addr.device,
> + prot_err->agent_addr.function);
> + break;
> + case RCH_DP:
> + pr_info("%s rcrb_base_address: 0x%016llx\n", pfx,
> + prot_err->agent_addr.rcrb_base_addr);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID) {
> + const __u8 *class_code;
> +
> + switch (prot_err->agent_type) {
> + case RCD:
> + case DEVICE:
> + case LD:
> + case FMLD:
> + case RP:
> + case DSP:
> + case USP:
> + pr_info("%s slot: %d\n", pfx,
> + prot_err->device_id.slot >> CPER_PCIE_SLOT_SHIFT);

Probably not in keeping with other CPER handling (because that's all ancient) but
I'm a great fan of FIELD_GET() and masks for cases like this.
I this particular case the slot field goes all they way to bit 15 so there
are no reserved bits up there, but I had to go check that which a FIELD_GET()/ mask
approach would have saved me doing.

> + pr_info("%s vendor_id: 0x%04x, device_id: 0x%04x\n",
> + pfx, prot_err->device_id.vendor_id,
> + prot_err->device_id.device_id);
> + pr_info("%s sub_vendor_id: 0x%04x, sub_device_id: 0x%04x\n",
> + pfx, prot_err->device_id.subsystem_vendor_id,
> + prot_err->device_id.subsystem_id);
> + class_code = prot_err->device_id.class_code;
> + pr_info("%s class_code: %02x%02x\n", pfx,
> + class_code[1], class_code[0]);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) {
> + switch (prot_err->agent_type) {
> + case RCD:
> + case DEVICE:
> + case LD:
> + case FMLD:
> + pr_info("%s lower_dw: 0x%08x, upper_dw: 0x%08x\n", pfx,
> + prot_err->dev_serial_num.lower_dw,
> + prot_err->dev_serial_num.upper_dw);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (prot_err->valid_bits & PROT_ERR_VALID_CAPABILITY) {
> + switch (prot_err->agent_type) {
> + case RCD:
> + case DEVICE:
> + case LD:
> + case FMLD:
> + case RP:
> + case DSP:
> + case USP:
> + print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4,
> + prot_err->capability,
> + sizeof(prot_err->capability), 0);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (prot_err->valid_bits & PROT_ERR_VALID_DVSEC) {
> + pr_info("%s DVSEC length: 0x%04x\n", pfx, prot_err->dvsec_len);
> +
> + pr_info("%s CXL DVSEC:\n", pfx);
> + print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, (prot_err + 1),

Excess brackets?

> + prot_err->dvsec_len, 0);
> + }
> +}
> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
> new file mode 100644
> index 000000000000..86bfcf7909ec
> --- /dev/null
> +++ b/drivers/firmware/efi/cper_cxl.h
> @@ -0,0 +1,66 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * UEFI Common Platform Error Record (CPER) support for CXL Section.
> + *
> + * Copyright (C) 2022 Advanced Micro Devices, Inc.
> + *
> + * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx>
> + */
> +
> +#ifndef LINUX_CPER_CXL_H
> +#define LINUX_CPER_CXL_H
> +
> +/* CXL Protocol Error Section */
> +#define CPER_SEC_CXL_PROT_ERR \
> + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> + 0x4B, 0x77, 0x10, 0x48)
> +
> +#pragma pack(1)
> +
> +/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
> +struct cper_sec_prot_err {
> + u64 valid_bits;
> + u8 agent_type;
> + u8 reserved[7];
> +
> + /*
> + * Except for RCH Downstream Port, all the remaining CXL Agent
> + * types are uniquely identified by the PCIe compatible SBDF number.
> + */
> + union {
> + u64 rcrb_base_addr;
> + struct {
> + u8 function;
> + u8 device;
> + u8 bus;
> + u16 segment;
> + u8 reserved_1[3];
> + };
> + } agent_addr;
> +
> + struct {
> + u16 vendor_id;
> + u16 device_id;
> + u16 subsystem_vendor_id;
> + u16 subsystem_id;
> + u8 class_code[2];
> + u16 slot;
> + u8 reserved_1[4];
> + } device_id;
> +
> + struct {
> + u32 lower_dw;
> + u32 upper_dw;
> + } dev_serial_num;
> +
> + u8 capability[60];
> + u16 dvsec_len;
> + u16 err_len;
> + u8 reserved_2[4];
> +};
> +
> +#pragma pack()
> +
> +void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
> +
> +#endif //__CPER_CXL_