Re: fs: NULL deref in atime_needs_update

From: Al Viro
Date: Thu Feb 25 2016 - 11:39:37 EST


On Thu, Feb 25, 2016 at 09:29:21AM +0100, Dmitry Vyukov wrote:

> Humm... I've left it running over night but no GPFs happened...
> Usually they happened within two hours or so. I would think that your
> patch fixes it and I did not actually apply it last time (or did not
> rebuild kernel). But I saw the new warnings that the patch adds, so I
> should have been rebuilt it...
>
> What I saw is a dozen of pairs of warnings like the one below.
> Is it possible the warning printing introduces enough delay to close
> the inconsistency window?....

All that stops these warnings from triggering atime_... oopsen is that dentry
involved isn't a symlink one. lookup_fast() should *not* return 0 and set
*inode to NULL. Ever. Trigger the same in walk_component() and you'll
get NULL nd->inode, which will oops as soon as you get to may_lookup() for
the next component (or atime one if dentry turns out to be a symlink and
not a directory).

IOW, what happened is that you've got dozens of instances of the underlying
bug triggered, all on non-symlink dentries in the last components of pathname.

The codepath in question is this:
*inode = d_backing_inode(dentry);
negative = d_is_negative(dentry);
if (read_seqcount_retry(&dentry->d_seq, seq))
return -ECHILD;
at that point we'd better have negative and *inode refering to the state of
dentry at the same moment - seq had been fetched before both the inode and
dentry flags and has remained unchanged until the later point, i.e. through
all the interval containing both fetches.
....
if (negative)
return -ENOENT;
no *inode changes in between, so it ought to be non-NULL
path->mnt = mnt;
path->dentry = dentry;
if (likely(__follow_mount_rcu(nd, path, inode, seqp))) {
WARN_ON(!*inode);
return 0;
and you are triggering that WARN_ON. So either __follow_mount_rcu() has
returned true and zeroed *inode, or we have something very wrong with
->d_seq.

__follow_mount_rcu() reassigns *inode only in one place:
mounted = __lookup_mnt(path->mnt, path->dentry);
if (!mounted)
break;
path->mnt = &mounted->mnt;
path->dentry = mounted->mnt.mnt_root;
nd->flags |= LOOKUP_JUMPED;
*seqp = read_seqcount_begin(&path->dentry->d_seq);
/*
* Update the inode too. We don't need to re-check the
* dentry sequence number here after this d_inode read,
* because a mount-point is always pinned.
*/
*inode = path->dentry->d_inode;
Note that it had returned true, so we have read_seqretry(&mount_lock, nd->m_seq)
yielding false, i.e. mount_lock hadn't been touched through all of that.
Having ->mnt_root->d_inode go NULL on a live vfsmount is a Bad Thing(tm), for
obvious reasons. ->mnt_root should've remained pinned until cleanup_mnt(),
which means that it must've already gotten through mntput_no_expire()
lock_mount_hash/unlock_mount_hash *before* we'd picked nd->m_seq; otherwise
we'd see mount_lock mismatch. Now, all removals from vfsmount hash should
happen under mount_lock and prior to cleanup_mnt() on victim. So we should've
had this:
Somebody:
hlist_del_init(&mounted->mnt_hash);
...
write_sequnlock(&mount_lock);
Us:
rcu_read_lock();
nd->m_seq = read_seqbegin(&mount_lock);
...
hlist_for_each_entry_rcu(p, head, mnt_hash)
... run into 'mounted'
find mount_lock untouched.

AFAICS, write_sequnlock/read_seqbegin barriers should've sufficed to
prevent that...

Hrm... OK, seeing that you still seem to trigger those within an hour or
two (and *any* of remaining WARN_ON() are serious bugs - none of the
"mitigation had been triggered" remained, sorry for not making it clear),
let's try this. Again, any WARN_ON triggered means that we'd caught something,
whether it progresses into oops or not.

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3d..86f81e3 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -323,6 +323,7 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
struct dentry *new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
+ WARN_ON(d_is_negative(new));
ino = autofs4_dentry_ino(new);
ino->last_used = jiffies;
dput(path->dentry);
diff --git a/fs/namei.c b/fs/namei.c
index f624d13..daa6b25 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1209,6 +1209,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
/* Handle an automount point */
if (managed & DCACHE_NEED_AUTOMOUNT) {
ret = follow_automount(path, nd, &need_mntput);
+ WARN_ON(d_is_negative(path->dentry));
if (ret < 0)
break;
continue;
@@ -1260,6 +1261,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
{
for (;;) {
struct mount *mounted;
+ void *p;
/*
* Don't forget we might have a non-mountpoint managed dentry
* that wants to block transit.
@@ -1289,7 +1291,9 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
* dentry sequence number here after this d_inode read,
* because a mount-point is always pinned.
*/
- *inode = path->dentry->d_inode;
+ p = *inode = path->dentry->d_inode;
+ if (unlikely(!p))
+ WARN_ON(!read_seqretry(&mount_lock, nd->m_seq));
}
return !read_seqretry(&mount_lock, nd->m_seq) &&
!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
@@ -1580,10 +1584,12 @@ static int lookup_fast(struct nameidata *nd,
*/
if (negative)
return -ENOENT;
+ WARN_ON(!*inode); // ->d_seq was fucked somehow
path->mnt = mnt;
path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+ if (likely(__follow_mount_rcu(nd, path, inode, seqp))) {
return 0;
+ }
unlazy:
if (unlazy_walk(nd, dentry, seq))
return -ECHILD;
@@ -1613,8 +1619,10 @@ unlazy:
path->mnt = mnt;
path->dentry = dentry;
err = follow_managed(path, nd);
- if (likely(!err))
+ if (likely(!err)) {
*inode = d_backing_inode(path->dentry);
+ WARN_ON(!*inode);
+ }
return err;

need_lookup:
@@ -1712,6 +1720,12 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link,
return 0;
if (!follow)
return 0;
+ /* make sure that d_is_symlink above matches inode */
+ if (nd->flags & LOOKUP_RCU) {
+ if (read_seqcount_retry(&link->dentry->d_seq, seq))
+ return -ECHILD;
+ }
+ WARN_ON(!inode); // now, _that_ should not happen.
return pick_link(nd, link, inode, seq);
}

@@ -1743,11 +1757,11 @@ static int walk_component(struct nameidata *nd, int flags)
if (err < 0)
return err;

- inode = d_backing_inode(path.dentry);
seq = 0; /* we are already out of RCU mode */
err = -ENOENT;
if (d_is_negative(path.dentry))
goto out_path_put;
+ inode = d_backing_inode(path.dentry);
}

if (flags & WALK_PUT)
@@ -3106,8 +3120,10 @@ static int do_last(struct nameidata *nd,
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
error = lookup_fast(nd, &path, &inode, &seq);
- if (likely(!error))
+ if (likely(!error)) {
+ WARN_ON(!inode);
goto finish_lookup;
+ }

if (error < 0)
return error;
@@ -3192,12 +3208,13 @@ retry_lookup:
return error;

BUG_ON(nd->flags & LOOKUP_RCU);
- inode = d_backing_inode(path.dentry);
seq = 0; /* out of RCU mode, so the value doesn't matter */
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
+ inode = d_backing_inode(path.dentry);
+ WARN_ON(!inode);
finish_lookup:
if (nd->depth)
put_link(nd);
@@ -3206,11 +3223,6 @@ finish_lookup:
if (unlikely(error))
return error;

- if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
- path_to_nameidata(&path, nd);
- return -ELOOP;
- }
-
if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
path_to_nameidata(&path, nd);
} else {
@@ -3229,6 +3241,10 @@ finish_open:
return error;
}
audit_inode(nd->name, nd->path.dentry, 0);
+ if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
+ error = -ELOOP;
+ goto out;
+ }
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
@@ -3273,6 +3289,10 @@ opened:
goto exit_fput;
}
out:
+ if (unlikely(error > 0)) {
+ WARN_ON(1);
+ error = -EINVAL;
+ }
if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fb1691..4128a5c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1060,6 +1060,8 @@ static void cleanup_mnt(struct mount *mnt)
* so mnt_get_writers() below is safe.
*/
WARN_ON(mnt_get_writers(mnt));
+ WARN_ON(!mnt->mnt.mnt_root->d_inode); // some joker has managed to
+ // make mnt_root negative on us
if (unlikely(mnt->mnt_pins.first))
mnt_pin_kill(mnt);
fsnotify_vfsmount_delete(&mnt->mnt);