Re: [PATCH v9 04/16] PCI/AER: Dequeue forwarded CXL error

From: Bowman, Terry
Date: Fri Jun 06 2025 - 17:09:12 EST




On 6/6/2025 10:57 AM, Dave Jiang wrote:
>
> On 6/3/25 10:22 AM, Terry Bowman wrote:
>> The AER driver is now designed to forward CXL protocol errors to the CXL
>> driver. Update the CXL driver with functionality to dequeue the forwarded
>> CXL error from the kfifo. Also, update the CXL driver to begin the protocol
>> error handling processing using the work received from the FIFO.
>>
>> Introduce function cxl_prot_err_work_fn() to dequeue work forwarded by the
>> AER service driver. This will begin the CXL protocol error processing
>> with a call to cxl_handle_prot_error().
>>
>> Update cxl/core/ras.c by adding cxl_rch_handle_error_iter() that was
>> previously in the AER driver.
>>
>> Introduce sbdf_to_pci() to take the SBDF values from 'struct cxl_prot_error_info'
>> and use in discovering the erring PCI device. Make scope based reference
>> increments/decrements for the discovered PCI device and the associated
>> CXL device.
>>
>> Implement cxl_handle_prot_error() to differentiate between Restricted CXL
>> Host (RCH) protocol errors and CXL virtual host (VH) protocol errors.
>> RCH errors will be processed with a call to walk the associated Root
>> Complex Event Collector's (RCEC) secondary bus looking for the Root Complex
>> Integrated Endpoint (RCiEP) to handle the RCH error. Export pcie_walk_rcec()
>> so the CXL driver can walk the RCEC's downstream bus, searching for
>> the RCiEP.
>>
>> VH correctable error (CE) processing will call the CXL CE handler. VH
>> uncorrectable errors (UCE) will call cxl_do_recovery(), implemented as a
>> stub for now and to be updated in future patch. Export pci_aer_clean_fatal_status()
>> and pci_clean_device_status() used to clean up AER status after handling.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx>
>> ---
>> drivers/cxl/core/ras.c | 92 +++++++++++++++++++++++++++++++++++++++++
>> drivers/pci/pci.c | 1 +
>> drivers/pci/pci.h | 8 ----
>> drivers/pci/pcie/aer.c | 1 +
>> drivers/pci/pcie/rcec.c | 1 +
>> include/linux/aer.h | 2 +
>> include/linux/pci.h | 10 +++++
>> 7 files changed, 107 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index d35525e79e04..9ed5c682e128 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>> @@ -110,8 +110,100 @@ static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>>
>> #ifdef CONFIG_PCIEAER_CXL
>>
>> +static void cxl_do_recovery(struct pci_dev *pdev)
>> +{
>> +}
>> +
>> +static int cxl_rch_handle_error_iter(struct pci_dev *pdev, void *data)
>> +{
>> + struct cxl_prot_error_info *err_info = data;
>> + struct pci_dev *pdev_ref __free(pci_dev_put) = pci_dev_get(pdev);
>> + struct cxl_dev_state *cxlds;
>> +
>> + /*
>> + * The capability, status, and control fields in Device 0,
>> + * Function 0 DVSEC control the CXL functionality of the
>> + * entire device (CXL 3.0, 8.1.3).
>> + */
>> + if (pdev->devfn != PCI_DEVFN(0, 0))
>> + return 0;
>> +
>> + /*
>> + * CXL Memory Devices must have the 502h class code set (CXL
>> + * 3.0, 8.1.12.1).
>> + */
>> + if ((pdev->class >> 8) != PCI_CLASS_MEMORY_CXL)
> Should use FIELD_GET() to be consistent with the rest of CXL code base
>
>> + return 0;
>> +
>> + if (!is_cxl_memdev(&pdev->dev) || !pdev->dev.driver)
> I think you need to hold the pdev->dev lock while checking if the driver exists.
>
>> + return 0;
>> +
>> + cxlds = pci_get_drvdata(pdev);
>> + struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
> Maybe a comment on why cxlmd->dev ref is needed here.
Ooof. This is coded incorrectly. This should have been using the pdev/cxlds device (pdev->dev/cxlds->dev)
instead of cxlmd->dev. CXL EP RAS is mapped using the PCI device dev (same as cxlds->dev) as the host.
The pdev ref count further above works making it functionally ok but this line is not doing anything to help with the
reference counting, is misleading, and just needs to be fixed. Unfortunately, this broken EP reference counting
is found later in the driver to protect accesses to EP mapped RAS.

Terry
>> +
>> + if (err_info->severity == AER_CORRECTABLE)
>> + cxl_cor_error_detected(pdev);
>> + else
>> + cxl_do_recovery(pdev);
>> +
>> + return 1;
>> +}
>> +
>> +static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
>> +{
>> + unsigned int devfn = PCI_DEVFN(err_info->device,
>> + err_info->function);
>> + struct pci_dev *pdev __free(pci_dev_put) =
>> + pci_get_domain_bus_and_slot(err_info->segment,
>> + err_info->bus,
>> + devfn);
> Looks like DanC already caught that. Maybe have this function return with a ref held. I would also add a comment for the function mention that the caller need to put the device.
>
>> + return pdev;
>> +}
>> +
>> +static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>> +{
>> + struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
>> +
>> + if (!pdev) {
>> + pr_err("Failed to find the CXL device\n");
>> + return;
>> + }
>> +
>> + /*
>> + * Internal errors of an RCEC indicate an AER error in an
>> + * RCH's downstream port. Check and handle them in the CXL.mem
>> + * device driver.
>> + */
>> + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_EC)
>> + return pcie_walk_rcec(pdev, cxl_rch_handle_error_iter, err_info);
>> +
> cxl_rch_handle_error_iter() holds the pdev device lock when handling errors. Does the code block below need locking?
>
> DJ
>
>> + if (err_info->severity == AER_CORRECTABLE) {
>> + int aer = pdev->aer_cap;
>> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>> + struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>> +
>> + if (aer)
>> + pci_clear_and_set_config_dword(pdev,
>> + aer + PCI_ERR_COR_STATUS,
>> + 0, PCI_ERR_COR_INTERNAL);
>> +
>> + cxl_cor_error_detected(pdev);
>> +
>> + pcie_clear_device_status(pdev);
>> + } else {
>> + cxl_do_recovery(pdev);
>> + }
>> +}
>> +
>> static void cxl_prot_err_work_fn(struct work_struct *work)
>> {
>> + struct cxl_prot_err_work_data wd;
>> +
>> + while (cxl_prot_err_kfifo_get(&wd)) {
>> + struct cxl_prot_error_info *err_info = &wd.err_info;
>> +
>> + cxl_handle_prot_error(err_info);
>> + }
>> }
>>
>> #else
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index e77d5b53c0ce..524ac32b744a 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -2328,6 +2328,7 @@ void pcie_clear_device_status(struct pci_dev *dev)
>> pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
>> pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
>> }
>> +EXPORT_SYMBOL_NS_GPL(pcie_clear_device_status, "CXL");
>> #endif
>>
>> /**
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index d6296500b004..3c54a5ed803e 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -649,16 +649,10 @@ static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; }
>> void pci_rcec_init(struct pci_dev *dev);
>> void pci_rcec_exit(struct pci_dev *dev);
>> void pcie_link_rcec(struct pci_dev *rcec);
>> -void pcie_walk_rcec(struct pci_dev *rcec,
>> - int (*cb)(struct pci_dev *, void *),
>> - void *userdata);
>> #else
>> static inline void pci_rcec_init(struct pci_dev *dev) { }
>> static inline void pci_rcec_exit(struct pci_dev *dev) { }
>> static inline void pcie_link_rcec(struct pci_dev *rcec) { }
>> -static inline void pcie_walk_rcec(struct pci_dev *rcec,
>> - int (*cb)(struct pci_dev *, void *),
>> - void *userdata) { }
>> #endif
>>
>> #ifdef CONFIG_PCI_ATS
>> @@ -967,7 +961,6 @@ void pci_no_aer(void);
>> void pci_aer_init(struct pci_dev *dev);
>> void pci_aer_exit(struct pci_dev *dev);
>> extern const struct attribute_group aer_stats_attr_group;
>> -void pci_aer_clear_fatal_status(struct pci_dev *dev);
>> int pci_aer_clear_status(struct pci_dev *dev);
>> int pci_aer_raw_clear_status(struct pci_dev *dev);
>> void pci_save_aer_state(struct pci_dev *dev);
>> @@ -976,7 +969,6 @@ void pci_restore_aer_state(struct pci_dev *dev);
>> static inline void pci_no_aer(void) { }
>> static inline void pci_aer_init(struct pci_dev *d) { }
>> static inline void pci_aer_exit(struct pci_dev *d) { }
>> -static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
>> static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; }
>> static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
>> static inline void pci_save_aer_state(struct pci_dev *dev) { }
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 5350fa5be784..6e88331c6303 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -290,6 +290,7 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev)
>> if (status)
>> pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status);
>> }
>> +EXPORT_SYMBOL_GPL(pci_aer_clear_fatal_status);
>>
>> /**
>> * pci_aer_raw_clear_status - Clear AER error registers.
>> diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c
>> index d0bcd141ac9c..fb6cf6449a1d 100644
>> --- a/drivers/pci/pcie/rcec.c
>> +++ b/drivers/pci/pcie/rcec.c
>> @@ -145,6 +145,7 @@ void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *),
>>
>> walk_rcec(walk_rcec_helper, &rcec_data);
>> }
>> +EXPORT_SYMBOL_NS_GPL(pcie_walk_rcec, "CXL");
>>
>> void pci_rcec_init(struct pci_dev *dev)
>> {
>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>> index 550407240ab5..c9a18eca16f8 100644
>> --- a/include/linux/aer.h
>> +++ b/include/linux/aer.h
>> @@ -77,12 +77,14 @@ struct cxl_prot_err_work_data {
>>
>> #if defined(CONFIG_PCIEAER)
>> int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>> +void pci_aer_clear_fatal_status(struct pci_dev *dev);
>> int pcie_aer_is_native(struct pci_dev *dev);
>> #else
>> static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>> {
>> return -EINVAL;
>> }
>> +static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
>> static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>> #endif
>>
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index bff3009f9ff0..cd53715d53f3 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -1806,6 +1806,9 @@ extern bool pcie_ports_native;
>>
>> int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
>> bool use_lt);
>> +void pcie_walk_rcec(struct pci_dev *rcec,
>> + int (*cb)(struct pci_dev *, void *),
>> + void *userdata);
>> #else
>> #define pcie_ports_disabled true
>> #define pcie_ports_native false
>> @@ -1816,8 +1819,15 @@ static inline int pcie_set_target_speed(struct pci_dev *port,
>> {
>> return -EOPNOTSUPP;
>> }
>> +
>> +static inline void pcie_walk_rcec(struct pci_dev *rcec,
>> + int (*cb)(struct pci_dev *, void *),
>> + void *userdata) { }
>> +
>> #endif
>>
>> +void pcie_clear_device_status(struct pci_dev *dev);
>> +
>> #define PCIE_LINK_STATE_L0S (BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */
>> #define PCIE_LINK_STATE_L1 BIT(2) /* L1 state */
>> #define PCIE_LINK_STATE_L1_1 BIT(3) /* ASPM L1.1 state */