Re: dcache shrink list corruption?

From: Al Viro
Date: Tue Apr 29 2014 - 22:31:51 EST


On Wed, Apr 30, 2014 at 12:20:13AM +0100, Al Viro wrote:
> On Tue, Apr 29, 2014 at 04:04:11PM -0700, Linus Torvalds wrote:
> > But at a minimum, we have "d_op->d_prune()" that would now be possibly
> > be called for the old dentry *after* a new dentry has been allocated.
> > Not to mention the inode not having been dropped. So it looks like a
> > disaster where the filesystem now ends up having concurrent "live"
> > dentries for the same entry. Even if one of them is on its way out,
> > it's still visible to the filesystem. That makes me very
> > uncomfortable.
> >
> > I'm starting to think that Miklos' original patch (perhaps with the
> > spinlock split to at least be one per superblock) is the most
> > straightforward approach after all. It's annoying having to add locks
> > here, but the whole pruning path should not be a hotpath, so maybe
> > it's not actually a big deal.
>
> FWIW, my solution is more or less working; I'll give more local beating
> and post it...

OK, aggregate diff follows, more readable splitup (3 commits) attached.
It seems to survive beating here; testing, review and comments are
welcome.

diff --git a/fs/dcache.c b/fs/dcache.c
index 494a9def..a83b933 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}

-/*
- * no locks, please.
- */
-static void d_free(struct dentry *dentry)
+static void dentry_free(struct dentry *dentry)
{
- BUG_ON((int)dentry->d_lockref.count > 0);
- this_cpu_dec(nr_dentry);
- if (dentry->d_op && dentry->d_op->d_release)
- dentry->d_op->d_release(dentry);
-
/* if dentry was never visible to RCU, immediate free is OK */
if (!(dentry->d_flags & DCACHE_RCUACCESS))
__d_free(&dentry->d_u.d_rcu);
@@ -420,40 +412,6 @@ static void dentry_lru_del(struct dentry *dentry)
}

/**
- * d_kill - kill dentry and return parent
- * @dentry: dentry to kill
- * @parent: parent dentry
- *
- * The dentry must already be unhashed and removed from the LRU.
- *
- * If this is the root of the dentry tree, return NULL.
- *
- * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
- * d_kill.
- */
-static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
- __releases(dentry->d_lock)
- __releases(parent->d_lock)
- __releases(dentry->d_inode->i_lock)
-{
- list_del(&dentry->d_u.d_child);
- /*
- * Inform d_walk() that we are no longer attached to the
- * dentry tree
- */
- dentry->d_flags |= DCACHE_DENTRY_KILLED;
- if (parent)
- spin_unlock(&parent->d_lock);
- dentry_iput(dentry);
- /*
- * dentry_iput drops the locks, at which point nobody (except
- * transient RCU lookups) can reach this dentry.
- */
- d_free(dentry);
- return parent;
-}
-
-/**
* d_drop - drop a dentry
* @dentry: dentry to drop
*
@@ -499,6 +457,8 @@ void d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(d_drop);

+static DECLARE_WAIT_QUEUE_HEAD(free_wq);
+
/*
* Finish off a dentry we've decided to kill.
* dentry->d_lock must be held, returns with it unlocked.
@@ -506,16 +466,17 @@ EXPORT_SYMBOL(d_drop);
* Returns dentry requiring refcount drop, or NULL if we're done.
*/
static struct dentry *
-dentry_kill(struct dentry *dentry, int unlock_on_failure)
+dentry_kill(struct dentry *dentry, struct list_head *morgue)
__releases(dentry->d_lock)
{
struct inode *inode;
struct dentry *parent;
+ bool can_free = true;

inode = dentry->d_inode;
if (inode && !spin_trylock(&inode->i_lock)) {
relock:
- if (unlock_on_failure) {
+ if (!morgue) {
spin_unlock(&dentry->d_lock);
cpu_relax();
}
@@ -531,6 +492,15 @@ relock:
goto relock;
}

+ if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+ /* morgue must be non-NULL */
+ list_move(&dentry->d_lru, morgue);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ /* inode must be NULL */
+ spin_unlock(&dentry->d_lock);
+ return parent;
+ }
/*
* The dentry is now unrecoverably dead to the world.
*/
@@ -543,10 +513,43 @@ relock:
if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
dentry->d_op->d_prune(dentry);

- dentry_lru_del(dentry);
+ if (dentry->d_flags & DCACHE_LRU_LIST) {
+ if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+ d_lru_del(dentry);
+ else if (morgue)
+ d_shrink_del(dentry);
+ else
+ can_free = false;
+ }
/* if it was on the hash then remove it */
__d_drop(dentry);
- return d_kill(dentry, parent);
+ list_del(&dentry->d_u.d_child);
+ /*
+ * Inform d_walk() that we are no longer attached to the
+ * dentry tree
+ */
+ dentry->d_flags |= DCACHE_DENTRY_KILLED;
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ dentry_iput(dentry);
+ /*
+ * dentry_iput drops the locks, at which point nobody (except
+ * transient RCU lookups) can reach this dentry.
+ */
+ BUG_ON((int)dentry->d_lockref.count > 0);
+ this_cpu_dec(nr_dentry);
+ if (dentry->d_op && dentry->d_op->d_release)
+ dentry->d_op->d_release(dentry);
+
+ if (likely(can_free)) {
+ dentry_free(dentry);
+ } else {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_MAY_FREE;
+ spin_unlock(&dentry->d_lock);
+ wake_up(&free_wq);
+ }
+ return parent;
}

/*
@@ -602,7 +605,7 @@ repeat:
return;

kill_it:
- dentry = dentry_kill(dentry, 1);
+ dentry = dentry_kill(dentry, NULL);
if (dentry)
goto repeat;
}
@@ -815,47 +818,10 @@ restart:
}
EXPORT_SYMBOL(d_prune_aliases);

-/*
- * Try to throw away a dentry - free the inode, dput the parent.
- * Requires dentry->d_lock is held, and dentry->d_count == 0.
- * Releases dentry->d_lock.
- *
- * This may fail if locks cannot be acquired no problem, just try again.
- */
-static struct dentry * try_prune_one_dentry(struct dentry *dentry)
- __releases(dentry->d_lock)
-{
- struct dentry *parent;
-
- parent = dentry_kill(dentry, 0);
- /*
- * If dentry_kill returns NULL, we have nothing more to do.
- * if it returns the same dentry, trylocks failed. In either
- * case, just loop again.
- *
- * Otherwise, we need to prune ancestors too. This is necessary
- * to prevent quadratic behavior of shrink_dcache_parent(), but
- * is also expected to be beneficial in reducing dentry cache
- * fragmentation.
- */
- if (!parent)
- return NULL;
- if (parent == dentry)
- return dentry;
-
- /* Prune ancestors. */
- dentry = parent;
- while (dentry) {
- if (lockref_put_or_lock(&dentry->d_lockref))
- return NULL;
- dentry = dentry_kill(dentry, 1);
- }
- return NULL;
-}
-
static void shrink_dentry_list(struct list_head *list)
{
- struct dentry *dentry;
+ struct dentry *dentry, *parent;
+ LIST_HEAD(morgue);

rcu_read_lock();
for (;;) {
@@ -891,24 +857,43 @@ static void shrink_dentry_list(struct list_head *list)
}
rcu_read_unlock();

+ parent = dentry_kill(dentry, &morgue);
/*
- * If 'try_to_prune()' returns a dentry, it will
- * be the same one we passed in, and d_lock will
- * have been held the whole time, so it will not
- * have been added to any other lists. We failed
- * to get the inode lock.
- *
- * We just add it back to the shrink list.
+ * If dentry_kill returns NULL, we have nothing more to do.
*/
- dentry = try_prune_one_dentry(dentry);
-
- rcu_read_lock();
- if (dentry) {
+ if (!parent) {
+ rcu_read_lock();
+ continue;
+ }
+ if (unlikely(parent == dentry)) {
+ /*
+ * trylocks have failed and d_lock has been held the
+ * whole time, so it could not have been added to any
+ * other lists. Just add it back to the shrink list.
+ */
+ rcu_read_lock();
d_shrink_add(dentry, list);
spin_unlock(&dentry->d_lock);
+ continue;
}
+ /*
+ * We need to prune ancestors too. This is necessary to prevent
+ * quadratic behavior of shrink_dcache_parent(), but is also
+ * expected to be beneficial in reducing dentry cache
+ * fragmentation.
+ */
+ dentry = parent;
+ while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
+ dentry = dentry_kill(dentry, NULL);
+ rcu_read_lock();
}
rcu_read_unlock();
+ while (unlikely(!list_empty(&morgue))) {
+ dentry = list_first_entry(&morgue, struct dentry, d_lru);
+ list_del_init(&dentry->d_lru);
+ wait_event(free_wq, dentry->d_flags & DCACHE_MAY_FREE);
+ dentry_free(dentry);
+ }
}

static enum lru_status
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3b9bfdb..3c7ec32 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -221,6 +221,8 @@ struct dentry_operations {
#define DCACHE_SYMLINK_TYPE 0x00300000 /* Symlink */
#define DCACHE_FILE_TYPE 0x00400000 /* Other file type */

+#define DCACHE_MAY_FREE 0x00800000
+
extern seqlock_t rename_lock;

static inline int dname_external(const struct dentry *dentry)