[PATCH 1/2] batch-write.patch

From: Hans Reiser
Date: Thu Jun 29 2006 - 15:16:04 EST





This patch adds a batch_write method to struct address_space_operations.
A filesystem may implement this operation to improve write performance.
The generic implementation, generic_batch_write, is split out of generic_file_buffered_write:
it writes one page using the prepare_write and commit_write address space operations.
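
For illustration only (this is not part of the patch), a filesystem could hook the
new operation up roughly as sketched below. The myfs_* names are hypothetical,
grab_cache_page() stands in for the pagecache/LRU batching helpers that are local
to mm/filemap.c, and error handling is simplified (no AOP_TRUNCATED_PAGE retry, no
trimming of blocks instantiated beyond i_size). The point is only that the
filesystem pays its per-call costs once and then loops over several pages itself:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>

/*
 * Sketch of a filesystem's batch_write.  Per-call work (locking,
 * transaction setup) is paid once, then user data are copied page by
 * page with the actor supplied in the write descriptor.  lru_pvec and
 * cached_page are unused in this sketch; the generic implementation
 * uses them for pagecache/LRU batching internal to mm/filemap.c.
 */
static long myfs_batch_write(struct file *file,
			     const write_descriptor_t *desc,
			     struct pagevec *lru_pvec,
			     struct page **cached_page, size_t *written)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	write_descriptor_t d = *desc;	/* local copy we may advance */
	/* stay within the current segment; the caller advances the iovec */
	size_t left = min(d.count, d.cur_iov->iov_len - d.iov_off);
	long status = 0;

	*written = 0;
	/* filesystem-specific per-call setup would go here */

	while (left) {
		unsigned long offset = d.pos & (PAGE_CACHE_SIZE - 1);
		unsigned long index = d.pos >> PAGE_CACHE_SHIFT;
		size_t bytes = min((size_t)(PAGE_CACHE_SIZE - offset), left);
		size_t copied;
		struct page *page;

		fault_in_pages_readable(d.buf, bytes);
		page = grab_cache_page(mapping, index);
		if (!page) {
			status = -ENOMEM;
			break;
		}
		status = a_ops->prepare_write(file, page, offset,
					      offset + bytes);
		if (status) {
			/* simplified: no AOP_TRUNCATED_PAGE retry here */
			if (status != AOP_TRUNCATED_PAGE)
				unlock_page(page);
			page_cache_release(page);
			break;
		}
		copied = d.actor(page, offset, bytes, &d);
		flush_dcache_page(page);
		status = a_ops->commit_write(file, page, offset,
					     offset + bytes);
		if (status != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (status)
			break;
		*written += copied;
		if (copied != bytes) {
			status = -EFAULT;
			break;
		}
		d.pos += bytes;
		d.buf += bytes;
		d.iov_off += bytes;
		left -= bytes;
	}

	/* filesystem-specific per-call teardown would go here */
	return status;
}

static const struct address_space_operations myfs_aops = {
	/* .readpage, .writepage, .prepare_write, .commit_write, ... */
	.batch_write	= myfs_batch_write,
};

Because the descriptor is const, the sketch keeps a local copy and advances it only
within the current iovec segment; crossing segments is left to the caller, which
readjusts the descriptor after every batch_write call.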

diff -puN include/linux/fs.h~batched-write include/linux/fs.h
--- linux-2.6.17-mm3/include/linux/fs.h~batched-write 2006-06-28 21:39:27.000000000 +0400
+++ linux-2.6.17-mm3-root/include/linux/fs.h 2006-06-28 21:39:27.000000000 +0400
@@ -346,6 +346,39 @@ enum positive_aop_returns {
struct page;
struct address_space;
struct writeback_control;
+typedef struct write_descriptor write_descriptor_t;
+
+/*
+ * This is the "write actor" function type, used by write() to copy data from
+ * user space. There are two functions of this type: write_actor and
+ * write_iovec_actor. The first is used when the user data are in a single
+ * segment, the second in the case of a vectored write.
+ */
+typedef size_t (*write_actor_t)(struct page *, unsigned long, size_t,
+ const write_descriptor_t *);
+
+/**
+ * struct write_descriptor - set of write arguments
+ * @pos: offset from the start of the file to write to
+ * @count: number of bytes to write
+ * @cur_iov: current i/o vector
+ * @iov_off: offset within current i/o vector
+ * @buf: user data
+ * @actor: function to copy user data with
+ *
+ * This structure passes to the batch_write address space operation all the
+ * information needed to continue the write.
+ */
+struct write_descriptor {
+ loff_t pos;
+ size_t count;
+ const struct iovec *cur_iov;
+ size_t iov_off;
+ char __user *buf;
+ write_actor_t actor;
+};
+
+struct pagevec;

struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -367,6 +400,8 @@ struct address_space_operations {
*/
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ long (*batch_write)(struct file *, const write_descriptor_t *,
+ struct pagevec *, struct page **, size_t *);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
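
As an aside (illustration only, not part of the patch): seen from a batch_write
implementation, write() and writev() look the same, because the descriptor carries
the matching actor. With page, offset and bytes computed as in the generic
implementation below:

	/*
	 * Copy user data into the page: the actor reads either from
	 * desc->buf (single-segment write) or from desc->cur_iov at
	 * desc->iov_off (vectored write), so batch_write need not
	 * distinguish the two cases.
	 */
	copied = desc->actor(page, offset, bytes, desc);
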
diff -puN mm/filemap.c~batched-write mm/filemap.c
--- linux-2.6.17-mm3/mm/filemap.c~batched-write 2006-06-28 21:39:27.000000000 +0400
+++ linux-2.6.17-mm3-root/mm/filemap.c 2006-06-28 22:03:59.000000000 +0400
@@ -2160,72 +2160,101 @@ generic_file_direct_write(struct kiocb *
}
EXPORT_SYMBOL(generic_file_direct_write);

-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/**
+ * write_actor - copy data from user buffer
+ * @page: the page to copy data to
+ * @offset: offset within the page
+ * @bytes: number of bytes to copy
+ * @desc: pointer to user buffer is obtained from here
+ *
+ * This is used to copy data from the user buffer into @page when the i/o
+ * vector has a single segment, i.e. for a plain write().
+ */
+static size_t write_actor(struct page *page, unsigned long offset,
+ size_t bytes, const write_descriptor_t *desc)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
+ return filemap_copy_from_user(page, offset, desc->buf, bytes);
+}

- pagevec_init(&lru_pvec, 0);
+/**
+ * write_iovec_actor - copy data from i/o vector
+ * @page: the page to copy data to
+ * @offset: offset within the page
+ * @bytes: number of bytes to copy
+ * @desc: current iovec and offset in it are obtained from here
+ *
+ * This is used to copy data from the user buffer into @page when the i/o
+ * vector has more than one segment, i.e. for writev().
+ */
+static size_t write_iovec_actor(struct page *page, unsigned long offset,
+ size_t bytes, const write_descriptor_t *desc)
+{
+ return filemap_copy_from_user_iovec(page, offset, desc->cur_iov,
+ desc->iov_off, bytes);
+}

- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
+/**
+ * generic_batch_write - generic implementation of batched write
+ * @file: the file to write to
+ * @desc: set of write arguments
+ * @lru_pvec: multipage container to batch adding pages to LRU list
+ * @cached_page: page allocated on a previous call but not yet used
+ * @written: returned number of bytes successfully written
+ *
+ * This implementation of the batch_write method writes at most one page of
+ * the file. It faults in the user page, grabs a page-cache page and calls the
+ * prepare_write and commit_write address space operations. User data are
+ * copied by an actor chosen by the caller for either write or writev.
+ */
+static long generic_batch_write(struct file *file,
+ const write_descriptor_t *desc,
+ struct pagevec *lru_pvec,
+ struct page **cached_page, size_t *written)
+{
+ const struct address_space_operations *a_ops = file->f_mapping->a_ops;
+ struct page *page;
+ unsigned long index;
+ size_t bytes;
+ unsigned long offset;
+ long status;
+
+ /* offset within the page at which the write starts */
+ offset = (desc->pos & (PAGE_CACHE_SIZE - 1));

- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
+ /* index of page we are to write to */
+ index = desc->pos >> PAGE_CACHE_SHIFT;

- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
+ /* number of bytes which can be written to the page */
+ bytes = PAGE_CACHE_SIZE - offset;

- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ /* Limit the size of the copy to the caller's write size */
+ bytes = min(bytes, desc->count);

+ /*
+ * Limit the size of the copy to that of the current segment,
+ * because fault_in_pages_readable() doesn't know how to walk
+ * segments.
+ */
+ bytes = min(bytes, desc->cur_iov->iov_len - desc->iov_off);
+
+ while (1) {
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ fault_in_pages_readable(desc->buf, bytes);

- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
+ page = __grab_cache_page(file->f_mapping, index, cached_page,
+ lru_pvec);
+ if (!page)
+ return -ENOMEM;

- status = a_ops->prepare_write(file, page, offset, offset+bytes);
+ status = a_ops->prepare_write(file, page, offset,
+ offset+bytes);
if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(file->f_mapping->host);

if (status != AOP_TRUNCATED_PAGE)
unlock_page(page);
@@ -2233,57 +2262,127 @@ generic_file_buffered_write(struct kiocb
if (status == AOP_TRUNCATED_PAGE)
continue;
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
+ * prepare_write() may have instantiated a few
+ * blocks outside i_size. Trim these off
+ * again.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ if (desc->pos + bytes > isize)
+ vmtruncate(file->f_mapping->host, isize);
+ return status;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+
+ /*
+ * Call the write actor to copy the user data into
+ * the page.
+ */
+ *written = desc->actor(page, offset, bytes, desc);
+
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (status == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
continue;
}
- if (likely(copied > 0)) {
- if (!status)
- status = copied;

- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
- filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
unlock_page(page);
mark_page_accessed(page);
page_cache_release(page);
+ break;
+ }
+ /*
+ * If commit_write returned an error, the write failed and we
+ * zero the number of written bytes. If the actor copied less
+ * than it was asked to, we return -EFAULT together with the
+ * number of bytes actually copied.
+ */
+ if (status)
+ *written = 0;
+ else if (*written != bytes)
+ status = -EFAULT;
+ return status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status;
+ struct page *cached_page = NULL;
+ struct pagevec lru_pvec;
+ write_descriptor_t desc;
+ size_t copied = 0;
+
+ pagevec_init(&lru_pvec, 0);
+
+ /*
+ * initialize write descriptor fields: position to write to
+ * and number of bytes to write
+ */
+ desc.pos = pos;
+ desc.count = count;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1)) {
+ desc.cur_iov = iov;
+ desc.iov_off = written;
+ desc.actor = write_actor;
+ } else {
+ filemap_set_next_iovec(&desc.cur_iov, &desc.iov_off, written);
+ desc.actor = write_iovec_actor;
+ }
+ /* pointer to user buffer */
+ desc.buf = desc.cur_iov->iov_base + desc.iov_off;
+
+ do {
+ /*
+ * When calling the filesystem for writes, there is processing
+ * that must be done:
+ * 1) per word
+ * 2) per page
+ * 3) per call to the FS
+ * If the FS is called once per page, it turns out that 3)
+ * costs more than 1) and 2) for sophisticated filesystems.
+ * To let the FS pay the cost of 3) only once, we call its
+ * batch_write method if it provides one.
+ */
+ if (a_ops->batch_write)
+ status = a_ops->batch_write(file, &desc, &lru_pvec,
+ &cached_page, &copied);
+ else
+ status = generic_batch_write(file, &desc, &lru_pvec,
+ &cached_page, &copied);
+ if (likely(copied > 0)) {
+ written += copied;
+ desc.count -= copied;
+ desc.pos += copied;
+ if (desc.count) {
+ /*
+ * Not everything is written yet; adjust the
+ * write descriptor for the next iteration.
+ */
+ if (unlikely(nr_segs > 1))
+ filemap_set_next_iovec(&desc.cur_iov,
+ &desc.iov_off,
+ copied);
+ else
+ desc.iov_off += copied;
+ desc.buf = desc.cur_iov->iov_base +
+ desc.iov_off;
+ }
+ }
if (status < 0)
break;
balance_dirty_pages_ratelimited(mapping);
cond_resched();
- } while (count);
- *ppos = pos;
+ } while (desc.count);
+ *ppos = desc.pos;

if (cached_page)
page_cache_release(cached_page);

_