Re: [PATCH v4] fs/namespace: defer RCU sync for MNT_DETACH umount

From: Christian Brauner
Date: Thu Apr 17 2025 - 12:28:38 EST


On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > So if there's some userspace process with a broken NFS server and it
> > > does umount(MNT_DETACH) it will end up hanging every other
> > > umount(MNT_DETACH) on the system because the delayed_mntput_work
> > > workqueue (to my understanding) cannot make progress.
> >
> > Ok, "to my understanding" has been updated after going back and reading
> > the delayed work code. Luckily it's not as bad as I thought it is
> > because it's queued on system_wq which is multi-threaded so it's at
> > least not causing everyone with MNT_DETACH to get stuck. I'm still
> > skeptical how safe this all is.
>
> I would (again) throw system_unbound_wq into the game because the former
> will remain on the CPU on which has been enqueued (if speaking about
> multi threading).

Yes, good point.

However, what about using polled grace periods?

A first simple-minded thing to do would be to record the grace period
after umount_tree() has finished and then check it in namespace_unlock():

diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..1e7ebcdd1ebc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
+static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
@@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
struct hlist_head head;
struct hlist_node *p;
struct mount *m;
+ unsigned long unmount_seq = rcu_unmount_seq;
LIST_HEAD(list);

hlist_move_list(&unmounted, &head);
@@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
if (likely(hlist_empty(&head)))
return;

- synchronize_rcu_expedited();
+ cond_synchronize_rcu_expedited(unmount_seq);

hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
hlist_del(&m->mnt_umount);
@@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
*/
mnt_notify_add(p);
}
+
+ rcu_unmount_seq = get_state_synchronize_rcu();
}

static void shrink_submounts(struct mount *mnt);


I'm not sure how much that would buy us. If it doesn't then it should be
possible to play with the following possibly strange idea:

diff --git a/fs/mount.h b/fs/mount.h
index 7aecf2a60472..51b86300dc50 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -61,6 +61,7 @@ struct mount {
struct rb_node mnt_node; /* node in the ns->mounts rbtree */
struct rcu_head mnt_rcu;
struct llist_node mnt_llist;
+ unsigned long mnt_rcu_unmount_seq;
};
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..aae9df75beed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
struct hlist_head head;
struct hlist_node *p;
struct mount *m;
+ bool needs_synchronize_rcu = false;
LIST_HEAD(list);

hlist_move_list(&unmounted, &head);
@@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
if (likely(hlist_empty(&head)))
return;

- synchronize_rcu_expedited();
+ hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+ if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
+ continue;
+
+ needs_synchronize_rcu = true;
+ break;
+ }
+
+ if (needs_synchronize_rcu)
+ synchronize_rcu_expedited();

hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
hlist_del(&m->mnt_umount);
@@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
}
}
change_mnt_propagation(p, MS_PRIVATE);
- if (disconnect)
+ if (disconnect) {
+ p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
hlist_add_head(&p->mnt_umount, &unmounted);
+ }

/*
* At this point p->mnt_ns is NULL, notification will be queued

This would allow us to elide the synchronize_rcu_expedited() call whenever
the grace period has already elapsed in the meantime, i.e. since we moved
that mount to the unmounted list.