[patch 1/2] vfsmount_lock

From: Maneesh Soni (maneesh@in.ibm.com)
Date: Wed May 21 2003 - 04:25:02 EST

Next message: Maneesh Soni: "Re: [patch 2/2] lockfree lookup_mnt"
Previous message: Mike Galbraith: "Re: web page on O(1) scheduler"
Next in thread: Maneesh Soni: "Re: [patch 2/2] lockfree lookup_mnt"
Reply: Maneesh Soni: "Re: [patch 2/2] lockfree lookup_mnt"
Reply: Andrew Morton: "Re: [patch 1/2] vfsmount_lock"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

While path walking we do follow_mount or follow_down which uses
dcache_lock for serialisation. vfsmount related operations also use
dcache_lock for all updates. I think we can use a separate lock for
vfsmount related work and can improve path walking.

The following two patches does the same. The first one replaces
dcache_lock with new vfsmount_lock in namespace.c. The lock is
local to namespace.c and is not required outside. The second patch
uses RCU to have lock free lookup_mnt(). The patches are quite simple
and straight forward.

The lockmeter reults show reduced contention, and lock acquisitions
for dcache_lock while running dcachebench* on a 4-way SMP box

SPINLOCKS HOLD WAIT
UTIL CON MEAN( MAX ) MEAN( MAX )(% CPU) TOTAL NOWAIT SPIN RJECT NAME

baselkm-2569:
20.7% 20.9% 0.5us( 146us) 2.9us( 144us)(0.81%) 31590840 79.1% 20.9% 0% dcache_lock
mntlkm-2569:
14.3% 13.6% 0.4us( 170us) 2.9us( 187us)(0.42%) 23071746 86.4% 13.6% 0% dcache_lock

We get more than 8% improvement on 4-way SMP and 44% improvement on 16-way
NUMAQ while runing dcachebench*.

Average (usecs/iteration) Std. Deviation
(lower is better)
4-way SMP
2.5.69 15739.3 470.90
2.5.69-mnt 14459.6 298.51

16-way NUMAQ
2.5.69 120426.5 363.78
2.5.69-mnt 63225.8 427.60

*dcachebench is a microbenchmark written by Bill Hartner and is available at
http://www-124.ibm.com/developerworks/opensource/linuxperf/dcachebench/dcachebench.html

vfsmount_lock.patch
-------------------
- Patch for replacing dcache_lock with new vfsmount_lock for all mount
related operation. This removes the need to take dcache_lock while
doing follow_mount or follow_down operations in path walking.

fs/namei.c | 24 +++++++++------------
fs/namespace.c | 64 ++++++++++++++++++++++++++++++++-------------------------
2 files changed, 48 insertions(+), 40 deletions(-)

diff -puN fs/namespace.c~vfsmount_lock fs/namespace.c
--- linux-2.5.69/fs/namespace.c~vfsmount_lock 2003-05-21 10:53:00.000000000 +0530
+++ linux-2.5.69-maneesh/fs/namespace.c 2003-05-21 11:02:47.000000000 +0530
@@ -28,6 +28,8 @@ extern int do_remount_sb(struct super_bl
extern int __init init_rootfs(void);
extern int __init fs_subsys_init(void);

+/* spinlock for vfsmount related operation, inplace of dcache_lock */
+static spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
static struct list_head *mount_hashtable;
static int hash_mask, hash_bits;
static kmem_cache_t *mnt_cache;
@@ -69,30 +71,38 @@ void free_vfsmnt(struct vfsmount *mnt)
         kmem_cache_free(mnt_cache, mnt);
}

+/*
+ * Now, lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
         struct list_head * head = mount_hashtable + hash(mnt, dentry);
         struct list_head * tmp = head;
- struct vfsmount *p;
+ struct vfsmount *p, *found = NULL;

+ spin_lock(&vfsmount_lock);
         for (;;) {
                 tmp = tmp->next;
                 p = NULL;
                 if (tmp == head)
                         break;
                 p = list_entry(tmp, struct vfsmount, mnt_hash);
- if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry)
+ if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+ found = mntget(p);
                         break;
+ }
         }
- return p;
+ spin_lock(&vfsmount_lock);
+ return found;
}

static int check_mnt(struct vfsmount *mnt)
{
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
         while (mnt->mnt_parent != mnt)
                 mnt = mnt->mnt_parent;
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
         return mnt == current->namespace->root;
}

@@ -273,15 +283,15 @@ void umount_tree(struct vfsmount *mnt)
                 mnt = list_entry(kill.next, struct vfsmount, mnt_list);
                 list_del_init(&mnt->mnt_list);
                 if (mnt->mnt_parent == mnt) {
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
                 } else {
                         struct nameidata old_nd;
                         detach_mnt(mnt, &old_nd);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
                         path_release(&old_nd);
                 }
                 mntput(mnt);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
         }
}

@@ -334,17 +344,17 @@ static int do_umount(struct vfsmount *mn
         }

         down_write(&current->namespace->sem);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);

         if (atomic_read(&sb->s_active) == 1) {
                 /* last instance - try to be smart */
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
                 lock_kernel();
                 DQUOT_OFF(sb);
                 acct_auto_close(sb);
                 unlock_kernel();
                 security_sb_umount_close(mnt);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
         }
         retval = -EBUSY;
         if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -352,7 +362,7 @@ static int do_umount(struct vfsmount *mn
                         umount_tree(mnt);
                 retval = 0;
         }
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
         if (retval)
                 security_sb_umount_busy(mnt);
         up_write(&current->namespace->sem);
@@ -442,17 +452,17 @@ static struct vfsmount *copy_tree(struct
                 q = clone_mnt(p, p->mnt_root);
                 if (!q)
                         goto Enomem;
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
                 list_add_tail(&q->mnt_list, &res->mnt_list);
                 attach_mnt(q, &nd);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
         }
         return res;
Enomem:
         if (res) {
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
                 umount_tree(res);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
         }
         return NULL;
}
@@ -476,7 +486,7 @@ static int graft_tree(struct vfsmount *m
         if (err)
                 goto out_unlock;

- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
         if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
                 struct list_head head;
                 attach_mnt(mnt, nd);
@@ -485,7 +495,7 @@ static int graft_tree(struct vfsmount *m
                 mntget(mnt);
                 err = 0;
         }
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
out_unlock:
         up(&nd->dentry->d_inode->i_sem);
         if (!err)
@@ -522,9 +532,9 @@ static int do_loopback(struct nameidata
         if (mnt) {
                 err = graft_tree(mnt, nd);
                 if (err) {
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
                         umount_tree(mnt);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
                 } else
                         mntput(mnt);
         }
@@ -589,7 +599,7 @@ static int do_move_mount(struct nameidat
         if (IS_DEADDIR(nd->dentry->d_inode))
                 goto out1;

- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
         if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
                 goto out2;

@@ -613,7 +623,7 @@ static int do_move_mount(struct nameidat
         detach_mnt(old_nd.mnt, &parent_nd);
         attach_mnt(old_nd.mnt, nd);
out2:
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
out1:
         up(&nd->dentry->d_inode->i_sem);
out:
@@ -794,9 +804,9 @@ int copy_namespace(int flags, struct tas
         down_write(&tsk->namespace->sem);
         /* First pass: copy the tree topology */
         new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
         list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);

         /* Second pass: switch the tsk->fs->* elements */
         if (fs) {
@@ -1017,7 +1027,7 @@ asmlinkage long sys_pivot_root(const cha
         if (new_nd.mnt->mnt_root != new_nd.dentry)
                 goto out2; /* not a mountpoint */
         tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
         if (tmp != new_nd.mnt) {
                 for (;;) {
                         if (tmp->mnt_parent == tmp)
@@ -1034,7 +1044,7 @@ asmlinkage long sys_pivot_root(const cha
         detach_mnt(user_nd.mnt, &root_parent);
         attach_mnt(user_nd.mnt, &old_nd);
         attach_mnt(new_nd.mnt, &root_parent);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
         chroot_fs_refs(&user_nd, &new_nd);
         security_sb_post_pivotroot(&user_nd, &new_nd);
         error = 0;
@@ -1051,7 +1061,7 @@ out0:
         unlock_kernel();
         return error;
out3:
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
         goto out2;
}

diff -puN fs/namei.c~vfsmount_lock fs/namei.c
--- linux-2.5.69/fs/namei.c~vfsmount_lock 2003-05-21 10:53:00.000000000 +0530
+++ linux-2.5.69-maneesh/fs/namei.c 2003-05-21 10:53:00.000000000 +0530
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, str
         return 1;
}

+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
{
         int res = 0;
         while (d_mountpoint(*dentry)) {
- struct vfsmount *mounted;
- spin_lock(&dcache_lock);
- mounted = lookup_mnt(*mnt, *dentry);
- if (!mounted) {
- spin_unlock(&dcache_lock);
+ struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+ if (!mounted)
                         break;
- }
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
+ *mnt = mounted;
                 dput(*dentry);
                 mntput(mounted->mnt_parent);
                 *dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount
         return res;
}

+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
{
         struct vfsmount *mounted;
-
- spin_lock(&dcache_lock);
+
         mounted = lookup_mnt(*mnt, *dentry);
         if (mounted) {
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
+ *mnt = mounted;
                 dput(*dentry);
                 mntput(mounted->mnt_parent);
                 *dentry = dget(mounted->mnt_root);
                 return 1;
         }
- spin_unlock(&dcache_lock);
         return 0;
}

-- 
Maneesh Soni
IBM Linux Technology Center, 
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Next message: Maneesh Soni: "Re: [patch 2/2] lockfree lookup_mnt"
Previous message: Mike Galbraith: "Re: web page on O(1) scheduler"
Next in thread: Maneesh Soni: "Re: [patch 2/2] lockfree lookup_mnt"
Reply: Maneesh Soni: "Re: [patch 2/2] lockfree lookup_mnt"
Reply: Andrew Morton: "Re: [patch 1/2] vfsmount_lock"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This archive was generated by hypermail 2b29 : Fri May 23 2003 - 22:00:43 EST