[PATCH 3/3] blockdev: turn a rw semaphore into a percpu rw semaphore

From: Mikulas Patocka
Date: Sat Jul 28 2012 - 12:42:38 EST


blockdev: turn a rw semaphore into a percpu rw semaphore

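bd_block_size_semaphore is taken for read in the block device read, write
and mmap paths (blkdev_aio_read, blkdev_aio_write, blkdev_mmap) and for
write in set_blocksize(). Converting it to a percpu rw semaphore avoids
cache line bouncing when many processes lock the semaphore for read.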

Partially based on a patch by Jeff Moyer <jmoyer@xxxxxxxxxx>.

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>

---
fs/block_dev.c | 30 ++++++++++++++++++++----------
include/linux/fs.h | 3 ++-
2 files changed, 22 insertions(+), 11 deletions(-)

Index: linux-3.5-fast/fs/block_dev.c
===================================================================
--- linux-3.5-fast.orig/fs/block_dev.c 2012-07-28 18:32:10.000000000 +0200
+++ linux-3.5-fast/fs/block_dev.c 2012-07-28 18:32:12.000000000 +0200
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *b
return -EINVAL;

/* Prevent starting I/O or mapping the device */
- down_write(&bdev->bd_block_size_semaphore);
+ percpu_down_write(&bdev->bd_block_size_semaphore);

/* Check that the block device is not memory mapped */
mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *b
if (!prio_tree_empty(&mapping->i_mmap) ||
!list_empty(&mapping->i_mmap_nonlinear)) {
mutex_unlock(&mapping->i_mmap_mutex);
- up_write(&bdev->bd_block_size_semaphore);
+ percpu_up_write(&bdev->bd_block_size_semaphore);
return -EBUSY;
}
mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *b
kill_bdev(bdev);
}

- up_write(&bdev->bd_block_size_semaphore);
+ percpu_up_write(&bdev->bd_block_size_semaphore);

return 0;
}
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(st
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
+
+ if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+ kmem_cache_free(bdev_cachep, ei);
+ return NULL;
+ }
+
return &ei->vfs_inode;
}

@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_h
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);

+ percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
kmem_cache_free(bdev_cachep, bdi);
}

@@ -491,7 +499,6 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
/* Initialize mutex for freeze. */
mutex_init(&bdev->bd_fsfreeze_mutex);
- init_rwsem(&bdev->bd_block_size_semaphore);
}

static inline void __bd_forget(struct inode *inode)
@@ -1592,12 +1599,13 @@ ssize_t blkdev_aio_read(struct kiocb *io
{
ssize_t ret;
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+ percpu_rwsem_ptr p;

- down_read(&bdev->bd_block_size_semaphore);
+ p = percpu_down_read(&bdev->bd_block_size_semaphore);

ret = generic_file_aio_read(iocb, iov, nr_segs, pos);

- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore, p);

return ret;
}
@@ -1616,10 +1624,11 @@ ssize_t blkdev_aio_write(struct kiocb *i
struct file *file = iocb->ki_filp;
struct block_device *bdev = I_BDEV(file->f_mapping->host);
ssize_t ret;
+ percpu_rwsem_ptr p;

BUG_ON(iocb->ki_pos != pos);

- down_read(&bdev->bd_block_size_semaphore);
+ p = percpu_down_read(&bdev->bd_block_size_semaphore);

ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1630,7 +1639,7 @@ ssize_t blkdev_aio_write(struct kiocb *i
ret = err;
}

- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore, p);

return ret;
}
@@ -1640,12 +1649,13 @@ int blkdev_mmap(struct file *file, struc
{
int ret;
struct block_device *bdev = I_BDEV(file->f_mapping->host);
+ percpu_rwsem_ptr p;

- down_read(&bdev->bd_block_size_semaphore);
+ p = percpu_down_read(&bdev->bd_block_size_semaphore);

ret = generic_file_mmap(file, vma);

- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore, p);

return ret;
}
Index: linux-3.5-fast/include/linux/fs.h
===================================================================
--- linux-3.5-fast.orig/include/linux/fs.h 2012-07-28 18:32:10.000000000 +0200
+++ linux-3.5-fast/include/linux/fs.h 2012-07-28 18:32:12.000000000 +0200
@@ -10,6 +10,7 @@
#include <linux/ioctl.h>
#include <linux/blk_types.h>
#include <linux/types.h>
+#include <linux/percpu-rwsem.h>

/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -714,7 +715,7 @@ struct block_device {
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
/* A semaphore that prevents I/O while block size is being changed */
- struct rw_semaphore bd_block_size_semaphore;
+ struct percpu_rw_semaphore bd_block_size_semaphore;
};

/*

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/