Re: INFO: task hung in iterate_supers

From: Tetsuo Handa
Date: Wed Jul 11 2018 - 06:20:08 EST


On 2018/07/10 19:34, Tetsuo Handa wrote:
> It turned out that, although the reason of stalling v9fs_mount() is currently
> unknown, the reason of many processes stuck at iterate_supers() is that
> they are unable to take s->s_umount object due to down_write_nested() below.
>
> /*
> * sget() can have s_umount recursion.
> *
> * When it cannot find a suitable sb, it allocates a new
> * one (this one), and tries again to find a suitable old
> * one.
> *
> * In case that succeeds, it will acquire the s_umount
> * lock of the old one. Since these are clearly distrinct
> * locks, and this object isn't exposed yet, there's no
> * risk of deadlocks.
> *
> * Annotate this by putting this lock in a different
> * subclass.
> */
> down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
>

Guessing from commit dabe0dc194d5d56d ("vfs: fix the rest of sget() races"),
this is an overlooked sget() race.

iterate_supers() is calling down_read(&sb_s_umount) on all superblocks found by
traversing super_blocks list. But sget_userns() adds "newly created superblock
to super_blocks list with ->s_umount held by alloc_super() for write" before
"mount_fs() sets SB_BORN flag after returning from type->mount()".

Therefore, if type->mount() (e.g. v9fs_mount()) took long after sget() for
some reason, processes will be blocked on a superblock on SB_BORN not yet set.
Doing the test (which is done after ->s_umount is held) before holding ->s_umount
(patch shown below) works for me. Although there seems to be many bugs in v9fs
code, I think that making sure that other threads won't try to wait for superblock
with SB_BORN not yet set is better.

---
fs/super.c | 31 ++++++++++++++++++++++---------
1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 50728d9..03c3b3f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,11 @@
#include <linux/user_namespace.h>
#include "internal.h"

+static inline bool sb_is_alive(struct super_block *sb)
+{
+ return sb->s_root && (sb->s_flags & SB_BORN);
+}
+
static int thaw_super_locked(struct super_block *sb);

static LIST_HEAD(super_blocks);
@@ -407,8 +412,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
bool trylock_super(struct super_block *sb)
{
if (down_read_trylock(&sb->s_umount)) {
- if (!hlist_unhashed(&sb->s_instances) &&
- sb->s_root && (sb->s_flags & SB_BORN))
+ if (!hlist_unhashed(&sb->s_instances) && sb_is_alive(sb))
return true;
up_read(&sb->s_umount);
}
@@ -590,6 +594,8 @@ static void __iterate_supers(void (*f)(struct super_block *))

spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
+ if (!sb_is_alive(sb))
+ continue;
if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
@@ -620,13 +626,15 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)

spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
+ if (!sb_is_alive(sb))
+ continue;
if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
spin_unlock(&sb_lock);

down_read(&sb->s_umount);
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ if (sb_is_alive(sb))
f(sb, arg);
up_read(&sb->s_umount);

@@ -656,11 +664,13 @@ void iterate_supers_type(struct file_system_type *type,

spin_lock(&sb_lock);
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
+ if (!sb_is_alive(sb))
+ continue;
sb->s_count++;
spin_unlock(&sb_lock);

down_read(&sb->s_umount);
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ if (sb_is_alive(sb))
f(sb, arg);
up_read(&sb->s_umount);

@@ -689,6 +699,8 @@ static struct super_block *__get_super(struct block_device *bdev, bool excl)
if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) {
+ if (!sb_is_alive(sb))
+ continue;
sb->s_count++;
spin_unlock(&sb_lock);
if (!excl)
@@ -696,7 +708,7 @@ static struct super_block *__get_super(struct block_device *bdev, bool excl)
else
down_write(&sb->s_umount);
/* still alive? */
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ if (sb_is_alive(sb))
return sb;
if (!excl)
up_read(&sb->s_umount);
@@ -813,11 +825,13 @@ struct super_block *user_get_super(dev_t dev)
if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_dev == dev) {
+ if (!sb_is_alive(sb))
+ continue;
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
/* still alive? */
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ if (sb_is_alive(sb))
return sb;
up_read(&sb->s_umount);
/* nope, got unmounted */
@@ -916,8 +930,7 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
static void do_emergency_remount_callback(struct super_block *sb)
{
down_write(&sb->s_umount);
- if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
- !sb_rdonly(sb)) {
+ if (sb_is_alive(sb) && sb->s_bdev && !sb_rdonly(sb)) {
/*
* What lock protects sb->s_flags??
*/
@@ -947,7 +960,7 @@ void emergency_remount(void)
static void do_thaw_all_callback(struct super_block *sb)
{
down_write(&sb->s_umount);
- if (sb->s_root && sb->s_flags & SB_BORN) {
+ if (sb_is_alive(sb)) {
emergency_thaw_bdev(sb);
thaw_super_locked(sb);
} else {
--
1.8.3.1



----------------------------------------
// autogenerated by syzkaller (http://github.com/google/syzkaller)

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/mount.h>
#include <signal.h>

struct thread_t {
int created, running, call;
pthread_t th;
};

static struct thread_t threads[16];
static void execute_call(int call);
static int running;

static void* thr(void* arg)
{
struct thread_t* th = (struct thread_t*)arg;
for (;;) {
while (!__atomic_load_n(&th->running, __ATOMIC_ACQUIRE))
syscall(SYS_futex, &th->running, FUTEX_WAIT, 0, 0);
execute_call(th->call);
__atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
__atomic_store_n(&th->running, 0, __ATOMIC_RELEASE);
syscall(SYS_futex, &th->running, FUTEX_WAKE);
}
return 0;
}

static void execute(int num_calls)
{
int call, thread;
running = 0;
for (call = 0; call < num_calls; call++) {
for (thread = 0; thread < sizeof(threads) / sizeof(threads[0]); thread++) {
struct thread_t* th = &threads[thread];
if (!th->created) {
th->created = 1;
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 128 << 10);
pthread_create(&th->th, &attr, thr, th);
}
if (!__atomic_load_n(&th->running, __ATOMIC_ACQUIRE)) {
th->call = call;
__atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
__atomic_store_n(&th->running, 1, __ATOMIC_RELEASE);
syscall(SYS_futex, &th->running, FUTEX_WAKE);
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 20 * 1000 * 1000;
syscall(SYS_futex, &th->running, FUTEX_WAIT, 1, &ts);
if (__atomic_load_n(&running, __ATOMIC_RELAXED))
usleep((call == num_calls - 1) ? 10000 : 1000);
break;
}
}
}
}

static void execute_call(int call)
{
static int fd[2] = { EOF, EOF };
static char buffer[128] = { };
switch (call) {
case 4:
sync();
break;
case 6:
if (pipe(fd) == EOF)
kill(0, SIGKILL);
break;
case 7:
mkdir("./file0", 0);
break;
case 8:
snprintf(buffer, sizeof(buffer) - 1, "trans=fd,rfdno=0x%x,wfdno=0x%x,version=9p2000.u,", fd[0], fd[1]);
mount(NULL, "./file0", "9p", 0, buffer);
break;
case 9:
write(fd[1], "\x79\x04\x00\x00\x7d\x01\x00\x05\x00\x58\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x23\x00\x00\x00\x00\x39\x00\x00"
"\x00\x00\x00\x10\x00\x76\x65\x72\x73\x69\x6f\x6e\x3d\x39\x70\x32"
"\x30\x30\x30\x2e\x75\x0d\x00\x77\x6c\x61\x6e\x31\x76\x62\x6f\x78"
"\x6e\x65\x74\x30\x03\x00\x39\x70\x00\x05\x00\x72\x66\x64\x6e\x6f"
"\x0c\x00\x76\x9a\xae\x6d\x5f\xff\xf0\x68\xa6\xbe\x6b\x2c\xc3\xfd"
"\x62\x6f\x78\x6e\x65\x74\x31\x28\x7d\x25\x24\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00", 135);
break;
}
}

int main(int argc, char *argv[])
{
int procid;
for (procid = 0; procid < 8; procid++)
if (fork() == 0)
while (1)
execute(10);
sleep(1000000);
return 0;
}
----------------------------------------



By the way, iterate_supers_type() was introduced in v3.1 for fs/ubifs/debug.c

/*
* TODO: this is racy - the file-system might have already been
* unmounted and we'd oops in this case. The plan is to fix it with
* help of 'iterate_supers_type()' which we should have in v3.0: when
* a debugfs opened, we rember FS's UUID in file->private_data. Then
* whenever we access the FS via a debugfs file, we iterate all UBIFS
* superblocks and fine the one with the same UUID, and take the
* locking right.
*
* The other way to go suggested by Al Viro is to create a separate
* 'ubifs-debug' file-system instead.
*/

but is still not called. Do we want to remove iterate_supers_type() ?