Re: [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error handlers

From: Jonathan Cameron
Date: Thu Jun 12 2025 - 12:56:15 EST


On Tue, 3 Jun 2025 12:22:35 -0500
Terry Bowman <terry.bowman@xxxxxxx> wrote:

> CXL Endpoint protocol errors are currently handled using PCI error
> handlers. The CXL Endpoint requires CXL specific handling in the case of
> uncorrectable error handling not provided by the PCI handlers.
>
> Add CXL specific handlers for CXL Endpoints. Rename the existing
> cxl_error_handlers to be pci_error_handlers to more correctly indicate
> the error type and follow naming consistency.

I wonder if a rename precursor patch would be better here than doing it
all in one go?

Having not read the patch description thoroughly this had me confused ;)

>
> Keep the existing PCI Endpoint handlers. PCI handlers can be called if the
> CXL device is not trained for alternate protocol (CXL). Update the CXL
> Endpoint PCI handlers to call the CXL handler. If the CXL uncorrectable
> handler returns PCI_ERS_RESULT_PANIC then the PCI handler invokes panic().
>
> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>


>
> static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
> {
> -

So this snuck in somewhere between upstream and here. If it is in this
set let's push the removal back to where it came from.

> return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
> }
>
> @@ -844,14 +843,15 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
> static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
> #endif
>
> +pci_ers_result_t cxl_error_detected(struct device *dev)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> + struct device *cxlmd_dev = &cxlds->cxlmd->dev;
> + pci_ers_result_t ue;
> +
> + scoped_guard(device, cxlmd_dev) {
> if (!dev->driver) {
> dev_warn(&pdev->dev,
> "%s: memdev disabled, abort error handling\n",
> dev_name(dev));
> - return PCI_ERS_RESULT_DISCONNECT;
> + return PCI_ERS_RESULT_PANIC;
> }
>
> if (cxlds->rcd)
> @@ -892,29 +900,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> ue = cxl_handle_endpoint_ras(cxlds);
> }
>
> -

Maybe something more in the patch description on why this chunk isn't relevant?
I guess we don't need the more complex handling as these are all panic :)

> - switch (state) {
> - case pci_channel_io_normal:
> - if (ue) {
> - device_release_driver(dev);
> - return PCI_ERS_RESULT_NEED_RESET;
> - }
> - return PCI_ERS_RESULT_CAN_RECOVER;
> - case pci_channel_io_frozen:
> - dev_warn(&pdev->dev,
> - "%s: frozen state error detected, disable CXL.mem\n",
> - dev_name(dev));
> - device_release_driver(dev);
> - return PCI_ERS_RESULT_NEED_RESET;
> - case pci_channel_io_perm_failure:
> - dev_warn(&pdev->dev,
> - "failure state error detected, request disconnect\n");
> - return PCI_ERS_RESULT_DISCONNECT;
> - }
> - return PCI_ERS_RESULT_NEED_RESET;
> + return ue;
> }
> EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");

> static int cxl_flit_size(struct pci_dev *pdev)
> {
> if (cxl_pci_flit_256(pdev))
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 0ef8c2068c0c..664f532cc838 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c

> @@ -244,6 +244,8 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
> static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
> {
> struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> + struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>
> if (!pdev) {
> pr_err("Failed to find the CXL device\n");
> @@ -260,15 +262,13 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>
> if (err_info->severity == AER_CORRECTABLE) {
> int aer = pdev->aer_cap;
> - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> - struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);

This code move seems somewhat unrelated? Maybe it's in the wrong patch and
becomes necessary later?

>
> if (aer)
> pci_clear_and_set_config_dword(pdev,
> aer + PCI_ERR_COR_STATUS,
> 0, PCI_ERR_COR_INTERNAL);
>
> - cxl_cor_error_detected(pdev);
> + cxl_cor_error_detected(&pdev->dev);
>
> pcie_clear_device_status(pdev);
> } else {