RE: [PATCH v10 14/17] cxl/pci: Introduce CXL Endpoint protocol error handlers

From: Shiju Jose
Date: Fri Jun 27 2025 - 08:28:23 EST


>-----Original Message-----
>From: Terry Bowman <terry.bowman@xxxxxxx>
>Sent: 26 June 2025 23:43
>To: dave@xxxxxxxxxxxx; Jonathan Cameron <jonathan.cameron@xxxxxxxxxx>;
>dave.jiang@xxxxxxxxx; alison.schofield@xxxxxxxxx; dan.j.williams@xxxxxxxxx;
>bhelgaas@xxxxxxxxxx; Shiju Jose <shiju.jose@xxxxxxxxxx>;
>ming.li@xxxxxxxxxxxx; Smita.KoralahalliChannabasappa@xxxxxxx;
>rrichter@xxxxxxx; dan.carpenter@xxxxxxxxxx;
>PradeepVineshReddy.Kodamati@xxxxxxx; lukas@xxxxxxxxx;
>Benjamin.Cheatham@xxxxxxx;
>sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx; terry.bowman@xxxxxxx;
>linux-cxl@xxxxxxxxxxxxxxx
>Cc: linux-kernel@xxxxxxxxxxxxxxx; linux-pci@xxxxxxxxxxxxxxx
>Subject: [PATCH v10 14/17] cxl/pci: Introduce CXL Endpoint protocol error
>handlers
>
>CXL Endpoint protocol errors are currently handled using PCI error handlers. The
>CXL Endpoint requires CXL-specific handling of uncorrectable errors (UCE) that
>is not provided by the PCI handlers.
>
>Add CXL specific handlers for CXL Endpoints. Rename the existing
>cxl_error_handlers to be pci_error_handlers to more correctly indicate the
>error type and follow naming consistency.
>
>The PCI handlers will be called if the CXL device is not trained for alternate
>protocol (CXL). Update the CXL Endpoint PCI handlers to call the CXL UCE
>handlers.
>
>The existing EP UCE handler includes checks for various results. These are no
>longer needed because CXL UCE recovery will not be attempted. Implement
>cxl_handle_ras() to return PCI_ERS_RESULT_NONE or PCI_ERS_RESULT_PANIC.
>The CXL UCE handler is called by cxl_do_recovery() that acts on the return
>value. In the case of the PCI handler path, call panic() if the result is
>PCI_ERS_RESULT_PANIC.
>
>Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
>Reviewed-by: Kuppuswamy Sathyanarayanan
><sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx>
>---
> drivers/cxl/core/native_ras.c | 15 ++++---
> drivers/cxl/core/pci.c | 77 ++++++++++++++++++-----------------
> drivers/cxl/cxl.h | 4 ++
> drivers/cxl/cxlpci.h | 6 +--
> drivers/cxl/pci.c | 8 ++--
> 5 files changed, 59 insertions(+), 51 deletions(-)
>
[...]
>diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>index 887b54cf3395..7209ffb5c2fe 100644
>--- a/drivers/cxl/core/pci.c
>+++ b/drivers/cxl/core/pci.c
>@@ -705,8 +705,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
> * Log the state of the RAS status registers and prepare them to log the
> * next error status. Return 1 if reset needed.
> */
>-static bool cxl_handle_ras(struct device *dev, u64 serial,
>- void __iomem *ras_base)
>+static pci_ers_result_t cxl_handle_ras(struct device *dev, u64 serial,
>+ void __iomem *ras_base)
> {
> u32 hl[CXL_HEADERLOG_SIZE_U32];
> void __iomem *addr;
>@@ -715,13 +715,13 @@ static bool cxl_handle_ras(struct device *dev, u64 serial,
>
> if (!ras_base) {
> dev_warn_once(dev, "CXL RAS register block is not mapped");
>- return false;
>+ return PCI_ERS_RESULT_NONE;
> }
>
> addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
> status = readl(addr);
> if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
>- return false;
>+ return PCI_ERS_RESULT_NONE;
>
> /* If multiple errors, log header points to first error from ctrl reg */
> if (hweight32(status) > 1) {
>@@ -738,7 +738,7 @@ static bool cxl_handle_ras(struct device *dev, u64 serial,
> trace_cxl_aer_uncorrectable_error(dev, serial, status, fe, hl);
> writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>
>- return true;
>+ return PCI_ERS_RESULT_PANIC;
> }
>
> #ifdef CONFIG_PCIEAER_CXL
>@@ -833,13 +833,14 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
> static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
> #endif
>
>-void cxl_cor_error_detected(struct pci_dev *pdev)
>+void cxl_cor_error_detected(struct device *dev)
> {
>+ struct pci_dev *pdev = to_pci_dev(dev);
> struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>- struct device *dev = &cxlds->cxlmd->dev;
>+ struct device *cxlmd_dev = &cxlds->cxlmd->dev;
>
>- scoped_guard(device, dev) {
>- if (!dev->driver) {
>+ scoped_guard(device, cxlmd_dev) {
>+ if (!cxlmd_dev->driver) {
> dev_warn(&pdev->dev,
> "%s: memdev disabled, abort error
>handling\n",
> dev_name(dev));
>@@ -854,20 +855,26 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
> }
> EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
>
>-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>- pci_channel_state_t state)
>+void pci_cor_error_detected(struct pci_dev *pdev)
> {
>- struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>- struct cxl_memdev *cxlmd = cxlds->cxlmd;
>- struct device *dev = &cxlmd->dev;
>- bool ue;
>+ cxl_cor_error_detected(&pdev->dev);
>+}
>+EXPORT_SYMBOL_NS_GPL(pci_cor_error_detected, "CXL");
>
>- scoped_guard(device, dev) {
>- if (!dev->driver) {
>+pci_ers_result_t cxl_error_detected(struct device *dev) {
>+ struct pci_dev *pdev = to_pci_dev(dev);
>+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>+ struct device *cxlmd_dev = &cxlds->cxlmd->dev;
>+ pci_ers_result_t ue;
>+
>+ scoped_guard(device, cxlmd_dev) {
>+
Please remove the extra blank line after the scoped_guard() line above.

>+ if (!cxlmd_dev->driver) {
> dev_warn(&pdev->dev,
> "%s: memdev disabled, abort error
>handling\n",
> dev_name(dev));

Thanks,
Shiju