linux-next: manual merge of the amdgpu tree with the pci tree

From: Stephen Rothwell
Date: Mon Dec 07 2020 - 21:57:23 EST


Hi all,

Today's linux-next merge of the amdgpu tree got a conflict in:

drivers/pci/pcie/err.c

between commits:

8f1bbfbc3596 ("PCI/ERR: Rename reset_link() to reset_subordinates()")
0791721d8007 ("PCI/ERR: Use "bridge" for clarity in pcie_do_recovery()")
05e9ae19ab83 ("PCI/ERR: Add pci_walk_bridge() to pcie_do_recovery()")

from the pci tree and commit:

36a8901e900a ("PCI/ERR: Fix reset logic in pcie_do_recovery() call")

from the amdgpu tree.

I fixed it up (I think - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non-trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging. You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

--
Cheers,
Stephen Rothwell

diff --cc drivers/pci/pcie/err.c
index 510f31f0ef6d,4a2735b70fa6..000000000000
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@@ -146,61 -146,49 +146,82 @@@ out
return 0;
}

+/**
+ * pci_walk_bridge - walk bridges potentially AER affected
+ * @bridge: bridge which may be a Port, an RCEC, or an RCiEP
+ * @cb: callback to be called for each device found
+ * @userdata: arbitrary pointer to be passed to callback
+ *
+ * If the device provided is a bridge, walk the subordinate bus, including
+ * any bridged devices on buses under this bus. Call the provided callback
+ * on each device found.
+ *
+ * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP,
+ * call the callback on the device itself.
+ */
+static void pci_walk_bridge(struct pci_dev *bridge,
+ int (*cb)(struct pci_dev *, void *),
+ void *userdata)
+{
+ if (bridge->subordinate)
+ pci_walk_bus(bridge->subordinate, cb, userdata);
+ else
+ cb(bridge, userdata);
+}
+
pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
- pci_channel_state_t state,
- pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
+ pci_channel_state_t state,
+ pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
{
+ int type = pci_pcie_type(dev);
+ struct pci_dev *bridge;
pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
- struct pci_bus *bus;
+ struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);

/*
- * Error recovery runs on all subordinates of the first downstream port.
- * If the downstream port detected the error, it is cleared at the end.
+ * If the error was detected by a Root Port, Downstream Port, RCEC,
+ * or RCiEP, recovery runs on the device itself. For Ports, that
+ * also includes any subordinate devices.
+ *
+ * If it was detected by another device (Endpoint, etc), recovery
+ * runs on the device and anything else under the same Port, i.e.,
+ * everything under "bridge".
*/
- if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
- pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
- dev = dev->bus->self;
- bus = dev->subordinate;
+ if (type == PCI_EXP_TYPE_ROOT_PORT ||
+ type == PCI_EXP_TYPE_DOWNSTREAM ||
+ type == PCI_EXP_TYPE_RC_EC ||
+ type == PCI_EXP_TYPE_RC_END)
+ bridge = dev;
+ else
+ bridge = pci_upstream_bridge(dev);

- pci_dbg(dev, "broadcast error_detected message\n");
+ pci_dbg(bridge, "broadcast error_detected message\n");
if (state == pci_channel_io_frozen) {
- pci_walk_bus(bus, report_frozen_detected, &status);
+ pci_walk_bridge(bridge, report_frozen_detected, &status);
+ /*
+ * After resetting the link using reset_link() call, the
+ * possible value of error status is either
+ * PCI_ERS_RESULT_DISCONNECT (failure case) or
+ * PCI_ERS_RESULT_NEED_RESET (success case).
+ * So ignore the return value of report_error_detected()
+ * call for fatal errors.
+ *
+ * In EDR mode, since AER and DPC Capabilities are owned by
+ * firmware, report_error_detected() will return error
+ * status PCI_ERS_RESULT_NO_AER_DRIVER. Continuing
+ * pcie_do_recovery() with error status as
+ * PCI_ERS_RESULT_NO_AER_DRIVER will report recovery failure
+ * irrespective of recovery status. But successful reset_link()
+ * call usually recovers all fatal errors. So ignoring the
+ * status result of report_error_detected() also helps EDR based
+ * error recovery.
+ */
- status = reset_link(dev);
+ status = reset_subordinates(bridge);
- if (status != PCI_ERS_RESULT_RECOVERED) {
+ if (status == PCI_ERS_RESULT_RECOVERED) {
+ status = PCI_ERS_RESULT_NEED_RESET;
+ } else {
+ status = PCI_ERS_RESULT_DISCONNECT;
- pci_warn(dev, "link reset failed\n");
+ pci_warn(bridge, "subordinate device reset failed\n");
goto failed;
}
} else {
@@@ -215,13 -203,25 +236,25 @@@

if (status == PCI_ERS_RESULT_NEED_RESET) {
/*
- * TODO: Should call platform-specific
- * functions to reset slot before calling
- * drivers' slot_reset callbacks?
+ * TODO: Optimize the call to pci_reset_bus()
+ *
+ * There are two components to pci_reset_bus().
+ *
+ * 1. Do platform specific slot/bus reset.
+ * 2. Save/Restore all devices in the bus.
+ *
+ * For hotplug capable devices and fatal errors,
+ * device is already in reset state due to link
+ * reset. So repeating platform specific slot/bus
+ * reset via pci_reset_bus() call is redundant. So
+ * can optimize this logic and conditionally call
+ * pci_reset_bus().
*/
+ pci_reset_bus(dev);
+
status = PCI_ERS_RESULT_RECOVERED;
- pci_dbg(dev, "broadcast slot_reset message\n");
- pci_walk_bus(bus, report_slot_reset, &status);
+ pci_dbg(bridge, "broadcast slot_reset message\n");
+ pci_walk_bridge(bridge, report_slot_reset, &status);
}

if (status != PCI_ERS_RESULT_RECOVERED)

Attachment: pgp_WuELHgTan.pgp
Description: OpenPGP digital signature