Re: [PATCH 3/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.

From: Smita Koralahalli
Date: Wed Jan 03 2024 - 16:12:22 EST


On 1/2/2024 9:58 AM, Ira Weiny wrote:
Smita Koralahalli wrote:
UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.

Add GHES support to detect CXL CPER Protocol record and cache error
severity, device_id, serial number and a pointer to CXL RAS capability
struct in struct cxl_cper_rec_data.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx>
---
drivers/acpi/apei/ghes.c | 15 ++++++++++
drivers/firmware/efi/cper_cxl.c | 52 +++++++++++++++++++++++++++++++++
include/linux/cxl-event.h | 6 ++++
3 files changed, 73 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d6a85fbc0a8b..6471584b2e79 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -714,6 +714,18 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
cper_callback(event_type, &data);
}
+void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
+{
+ struct cxl_cper_rec_data data;
+
+ memset(&data, 0, sizeof(data));

Does this need to be done? Could cxl_cper_handle_prot_err_info() use
something like this initially?

*data = (struct cxl_cper_rec_data) {
.rec.hdr.dev_serial_num.lower_dw = prot_err->dev_serial_num.lower_dw;
.rec.hdr.dev_serial_num.upper_dw = prot_err->dev_serial_num.upper_dw;
};

The serial number is always set even if it is not valid.

Ok will copy serial number at the beginning of cxl_cper_handle_prot_err_info() and remove memset.


+
+ if (cxl_cper_handle_prot_err_info(gdata, &data))
+ return;
+
+ data.severity = gdata->error_severity;
+}
+
int cxl_cper_register_callback(cxl_cper_callback callback)
{
guard(rwsem_write)(&cxl_cper_rw_sem);
@@ -768,6 +780,9 @@ static bool ghes_do_proc(struct ghes *ghes,
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev);
}
+ else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+ cxl_cper_handle_prot_err(gdata);
+ }
else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 4fd8d783993e..3bc0b9f28c9e 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@
*/
#include <linux/cper.h>
+#include <acpi/ghes.h>
#include "cper_cxl.h"
#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
@@ -176,3 +177,54 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
sizeof(cxl_ras->header_log), 0);
}
}
+
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_rec_data *data)
+{
+ struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+ struct cper_cxl_event_devid *device_id = &data->rec.hdr.device_id;
+ struct cper_cxl_event_sn *dev_serial_num = &data->rec.hdr.dev_serial_num;
+ size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+ pr_err(FW_WARN "Not a valid protocol error log\n");
+ return -EINVAL;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
+ pr_err(FW_WARN "Not a valid Device ID\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The device ID or agent address is only valid for CXL 1.1 device,
+ * CXL 2.0 device, CXL 2.0 Logical device, CXL 2.0 Fabric Manager
+ * Managed Logical Device, CXL Root Port, CXL Downstream Switch
+ * Port, or CXL Upstream Switch Port.
+ */
+ if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
+ device_id->segment_num = prot_err->agent_addr.segment;
+ device_id->bus_num = prot_err->agent_addr.bus;
+ device_id->device_num = prot_err->agent_addr.device;
+ device_id->func_num = prot_err->agent_addr.function;
+ } else {
+ pr_err(FW_WARN "Not a valid agent type\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The device serial number is only valid for CXL 1.1 device, CXL
+ * 2.0 device, CXL 2.0 Logical device, or CXL 2.0 Fabric Manager
+ * Managed Logical Device.
+ */
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
+ prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
+ pr_warn(FW_WARN "No valid serial number present\n");
+
+ dev_serial_num->lower_dw = prot_err->dev_serial_num.lower_dw;
+ dev_serial_num->upper_dw = prot_err->dev_serial_num.upper_dw;
+
+ data->cxl_ras = (struct cxl_ras_capability_regs *)((long)prot_err + size);

I think this is ok now because the cxl trace code processes the data
before returning (in next patch). But I'm a bit leary about passing a
pointer to data controlled by the acpi sub-system. At some point the
event may get cached to be processed by another thread and it might be
better to copy the struct here.

Ok will make the changes. Rewrote the struct in patch 1.


+
+ return 0;
+}
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 90d8390a73cb..7ba2dfd6619e 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -154,11 +154,17 @@ struct cxl_ras_capability_regs {
struct cxl_cper_rec_data {
struct cxl_cper_event_rec rec;
+ struct cxl_ras_capability_regs *cxl_ras;
+ int severity;

NIT: When I saw this without any addition to event type I wondered if the
series would bisect. I see it does because the record is not sent until
the next patch. But I wonder if the 2 patches would be best reversed.

You mean to say patch 4 to be 3 and 3 to be 4?

And since I haven't used severity yet, fix it by declaring severity to where it is used..


Also, should cxl_ras + severity be put in a protocol error sub-struct?
Does severity ever apply to event records?

No, not in any of the component event records. Only place is in "Event Record Flags" in Common Event Record Format (Table 8-42).

Addressed in patch 1 to make a sub-struct.

Thanks,
Smita


Ira

};
typedef void (*cxl_cper_callback)(enum cxl_event_type type,
struct cxl_cper_rec_data *data);
+struct acpi_hest_generic_data;
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_rec_data *data);
+
#ifdef CONFIG_ACPI_APEI_GHES
int cxl_cper_register_callback(cxl_cper_callback callback);
int cxl_cper_unregister_callback(cxl_cper_callback callback);
--
2.17.1