Re: [PATCH v12 01/20] cxl/port: Fix release of RCD endpoints

From: Dan Williams
Date: Fri Oct 27 2023 - 21:39:39 EST


Dan Williams wrote:
> Robert Richter wrote:
> > Dan,
> [..]
> >
> > delete_endpoint() is called here, but the uport etc. is not unbound.
> > Which means this is not true:
> >
> > if (parent->driver && !endpoint->dead) {
> > ...
> >
> > I don't remember this with my patch. The parent is there different, so
> > that could be the reason.
> >
> > I could not yet look into more detail but wanted to let you know. Will
> > continue.
>
> Apologies, I didn't have that regression going, I think I see the issue.
> Thanks for the heads up.

Here is the incremental fix on top of the lifetime fix:

diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 6230ddfc0be8..0fe915ec2cc2 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1217,30 +1217,39 @@ static struct device *grandparent(struct device *dev)
return NULL;
}

+static struct device *endpoint_host(struct cxl_port *endpoint)
+{
+ struct cxl_port *port = to_cxl_port(endpoint->dev.parent);
+
+ if (is_cxl_root(port))
+ return port->uport_dev;
+ return &port->dev;
+}
+
static void delete_endpoint(void *data)
{
struct cxl_memdev *cxlmd = data;
struct cxl_port *endpoint = cxlmd->endpoint;
- struct device *parent = endpoint->dev.parent;
+ struct device *host = endpoint_host(endpoint);

- device_lock(parent);
- if (parent->driver && !endpoint->dead) {
- devm_release_action(parent, cxl_unlink_parent_dport, endpoint);
- devm_release_action(parent, cxl_unlink_uport, endpoint);
- devm_release_action(parent, unregister_port, endpoint);
+ device_lock(host);
+ if (host->driver && !endpoint->dead) {
+ devm_release_action(host, cxl_unlink_parent_dport, endpoint);
+ devm_release_action(host, cxl_unlink_uport, endpoint);
+ devm_release_action(host, unregister_port, endpoint);
}
cxlmd->endpoint = NULL;
- device_unlock(parent);
+ device_unlock(host);
put_device(&endpoint->dev);
- put_device(parent);
+ put_device(host);
}

int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint)
{
- struct device *parent = endpoint->dev.parent;
+ struct device *host = endpoint_host(endpoint);
struct device *dev = &cxlmd->dev;

- get_device(parent);
+ get_device(host);
get_device(&endpoint->dev);
cxlmd->endpoint = endpoint;
cxlmd->depth = endpoint->depth;


---

...and here is the new regression test so I don't mess up and miss this
again:

diff --git a/cxl/memdev.c b/cxl/memdev.c
index d76a4d86a40a..81dfd4c25b25 100644
--- a/cxl/memdev.c
+++ b/cxl/memdev.c
@@ -752,6 +752,8 @@ static int memdev_action(int argc, const char **argv, struct cxl_ctx *ctx,
if (end[0] == 0)
continue;
} else {
+ unsigned long domain, bus, dev, func;
+
if (strcmp(argv[i], "all") == 0) {
argc = 1;
break;
@@ -760,6 +762,12 @@ static int memdev_action(int argc, const char **argv, struct cxl_ctx *ctx,
continue;
if (sscanf(argv[i], "%lu", &id) == 1)
continue;
+ if (sscanf(argv[i], "%lx:%lx:%lx.%lx", &domain, &bus, &dev, &func))
+ continue;
+ if (sscanf(argv[i], "cxl_mem.%lu", &id))
+ continue;
+ if (sscanf(argv[i], "cxl_rcd.%lu", &id))
+ continue;
}

log_err(&ml, "'%s' is not a valid memdev %s\n", argv[i],
diff --git a/test/cxl-topology.sh b/test/cxl-topology.sh
index 89d01a89ccb1..0320887a953b 100644
--- a/test/cxl-topology.sh
+++ b/test/cxl-topology.sh
@@ -120,6 +120,13 @@ count=$(jq "map(select(.pmem_size == $pmem_size)) | length" <<< $json)
((bridges == 2 && count == 8 || bridges == 3 && count == 10 ||
bridges == 4 && count == 11)) || err "$LINENO"

+# check rcd endpoints disappear when disabling the memdev
+m=$($CXL list -M -b cxl_test | jq -r ".[].host" | grep rcd)
+ep=$($CXL list -E -m $m | jq -r ".[].endpoint")
+$CXL disable-memdev $m --force
+check=$($CXL list -E -e $ep | jq -r ".[].endpoint")
+[ -z "$check" ] || err "$LINENO"
+$CXL enable-memdev $m