[RFC] [PATCH] 2.5.35 patch for making DIO async

From: Badari Pulavarty (pbadari@us.ibm.com)
Date: Tue Sep 17 2002 - 16:03:02 EST


Hi Ben,

Here is a patch against 2.5.35 to make direct I/O (DIO) asynchronous.
Basically, the DIO code no longer waits for I/O completion itself.
Waiting is done at a higher level (the same way generic_file_read does).

Here is what I did:

1) passed a kiocb *iocb down to the DIO code
2) allocated the "dio" with kmalloc() (instead of putting it on the stack)
3) after submitting the bios, the DIO code now returns -EIOCBQUEUED
4) removed dio_bio_end_io(), dio_await_one(), dio_await_completion()
5) added dio_bio_end_aio(), which calls dio_bio_complete() for
   each bio and, if dio->bio_count becomes 0, calls aio_complete()
6) changed the raw_read/raw_write/... routines to pass a sync kiocb
   and wait on it.

Any feedback on the approach or the patch is appreciated.

Thanks,
Badari

diff -Naur -X dontdiff linux-2.5.35/drivers/char/raw.c linux-2.5.35.aio/drivers/char/raw.c
--- linux-2.5.35/drivers/char/raw.c Sun Sep 15 19:18:37 2002
+++ linux-2.5.35.aio/drivers/char/raw.c Tue Sep 17 09:00:39 2002
@@ -201,8 +201,9 @@
 }
 
 static ssize_t
-rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+rw_raw_aio_dev(int rw, struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
 {
+ struct file *filp = iocb->ki_filp;
         const int minor = minor(filp->f_dentry->d_inode->i_rdev);
         struct block_device *bdev = raw_devices[minor].binding;
         struct inode *inode = bdev->bd_inode;
@@ -222,7 +223,7 @@
                 count = inode->i_size - *offp;
                 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, count);
         }
- ret = generic_file_direct_IO(rw, inode, iov, *offp, nr_segs);
+ ret = generic_file_direct_IO(rw, iocb, inode, iov, *offp, nr_segs);
 
         if (ret > 0)
                 *offp += ret;
@@ -231,6 +232,19 @@
 }
 
 static ssize_t
+rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ ret = rw_raw_aio_dev(rw, &kiocb, iov, nr_segs, offp);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ return ret;
+}
+
+static ssize_t
 raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
 {
         struct iovec local_iov = { .iov_base = buf, .iov_len = size};
@@ -246,6 +260,21 @@
         return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
 }
 
+static ssize_t
+raw_aio_read(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+ struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+ return rw_raw_aio_dev(READ, iocb, &local_iov, 1, offp);
+}
+
+static ssize_t
+raw_aio_write(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+ struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+ return rw_raw_aio_dev(WRITE, iocb, &local_iov, 1, offp);
+}
 static ssize_t
 raw_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
 {
@@ -260,7 +289,9 @@
 
 static struct file_operations raw_fops = {
         .read = raw_read,
+ .aio_read= raw_aio_read,
         .write = raw_write,
+ .aio_write= raw_aio_write,
         .open = raw_open,
         .release= raw_release,
         .ioctl = raw_ioctl,
diff -Naur -X dontdiff linux-2.5.35/fs/block_dev.c linux-2.5.35.aio/fs/block_dev.c
--- linux-2.5.35/fs/block_dev.c Sun Sep 15 19:19:09 2002
+++ linux-2.5.35.aio/fs/block_dev.c Tue Sep 17 09:00:26 2002
@@ -116,10 +116,10 @@
 }
 
 static int
-blkdev_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
- return generic_direct_IO(rw, inode, iov, offset,
+ return generic_direct_IO(rw, iocb, inode, iov, offset,
                                 nr_segs, blkdev_get_blocks);
 }
 
diff -Naur -X dontdiff linux-2.5.35/fs/direct-io.c linux-2.5.35.aio/fs/direct-io.c
--- linux-2.5.35/fs/direct-io.c Sun Sep 15 19:18:30 2002
+++ linux-2.5.35.aio/fs/direct-io.c Tue Sep 17 09:48:16 2002
@@ -20,6 +20,7 @@
 #include <linux/err.h>
 #include <linux/buffer_head.h>
 #include <linux/rwsem.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 
 /*
@@ -68,6 +69,8 @@
         spinlock_t bio_list_lock; /* protects bio_list */
         struct bio *bio_list; /* singly linked via bi_private */
         struct task_struct *waiter; /* waiting task (NULL if none) */
+ struct kiocb *iocb; /* iocb ptr */
+ int results;
 };
 
 /*
@@ -145,25 +148,41 @@
 }
 
 /*
- * The BIO completion handler simply queues the BIO up for the process-context
- * handler.
- *
- * During I/O bi_private points at the dio. After I/O, bi_private is used to
- * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ * Process one completed BIO. No locks are held.
  */
-static void dio_bio_end_io(struct bio *bio)
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
 {
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int page_no;
+
+ for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+ struct page *page = bvec[page_no].bv_page;
+
+ if (dio->rw == READ)
+ set_page_dirty(page);
+ page_cache_release(page);
+ }
+ atomic_dec(&dio->bio_count);
+ bio_put(bio);
+ return uptodate ? 0 : -EIO;
+}
+
+static void dio_bio_end_aio(struct bio *bio)
+{
+ int ret;
         struct dio *dio = bio->bi_private;
- unsigned long flags;
+ ret = dio_bio_complete(dio, bio);
+ if (ret)
+ dio->results = ret;
 
- spin_lock_irqsave(&dio->bio_list_lock, flags);
- bio->bi_private = dio->bio_list;
- dio->bio_list = bio;
- if (dio->waiter)
- wake_up_process(dio->waiter);
- spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ if (atomic_read(&dio->bio_count) == 0) {
+ aio_complete(dio->iocb, dio->results, 0);
+ kfree(dio);
+ }
 }
 
+
 static int
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
                 sector_t first_sector, int nr_vecs)
@@ -180,7 +199,7 @@
         bio->bi_size = 0;
         bio->bi_sector = first_sector;
         bio->bi_io_vec[0].bv_page = NULL;
- bio->bi_end_io = dio_bio_end_io;
+ bio->bi_end_io = dio_bio_end_aio;
 
         dio->bio = bio;
         dio->bvec = NULL; /* debug */
@@ -212,75 +231,6 @@
 }
 
 /*
- * Wait for the next BIO to complete. Remove it and return it.
- */
-static struct bio *dio_await_one(struct dio *dio)
-{
- unsigned long flags;
- struct bio *bio;
-
- spin_lock_irqsave(&dio->bio_list_lock, flags);
- while (dio->bio_list == NULL) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (dio->bio_list == NULL) {
- dio->waiter = current;
- spin_unlock_irqrestore(&dio->bio_list_lock, flags);
- blk_run_queues();
- schedule();
- spin_lock_irqsave(&dio->bio_list_lock, flags);
- dio->waiter = NULL;
- }
- set_current_state(TASK_RUNNING);
- }
- bio = dio->bio_list;
- dio->bio_list = bio->bi_private;
- spin_unlock_irqrestore(&dio->bio_list_lock, flags);
- return bio;
-}
-
-/*
- * Process one completed BIO. No locks are held.
- */
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec;
- int page_no;
-
- for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
- struct page *page = bvec[page_no].bv_page;
-
- if (dio->rw == READ)
- set_page_dirty(page);
- page_cache_release(page);
- }
- atomic_dec(&dio->bio_count);
- bio_put(bio);
- return uptodate ? 0 : -EIO;
-}
-
-/*
- * Wait on and process all in-flight BIOs.
- */
-static int dio_await_completion(struct dio *dio)
-{
- int ret = 0;
-
- if (dio->bio)
- dio_bio_submit(dio);
-
- while (atomic_read(&dio->bio_count)) {
- struct bio *bio = dio_await_one(dio);
- int ret2;
-
- ret2 = dio_bio_complete(dio, bio);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
-}
-
-/*
  * A really large O_DIRECT read or write can generate a lot of BIOs. So
  * to keep the memory consumption sane we periodically reap any completed BIOs
  * during the BIO generation phase.
@@ -528,77 +478,83 @@
 }
 
 int
-direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+direct_io_worker(int rw, struct kiocb *iocb, struct inode * inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ get_blocks_t get_blocks)
 {
         const unsigned blkbits = inode->i_blkbits;
         unsigned long user_addr;
- int seg, ret2, ret = 0;
- struct dio dio;
- size_t bytes, tot_bytes = 0;
-
- dio.bio = NULL;
- dio.bvec = NULL;
- dio.inode = inode;
- dio.rw = rw;
- dio.blkbits = blkbits;
- dio.block_in_file = offset >> blkbits;
- dio.blocks_available = 0;
-
- dio.boundary = 0;
- dio.reap_counter = 0;
- dio.get_blocks = get_blocks;
- dio.last_block_in_bio = -1;
- dio.next_block_in_bio = -1;
+ int seg, ret = 0;
+ struct dio *dio;
+ size_t bytes;
+
+ dio = (struct dio *)kmalloc(sizeof(struct dio), GFP_KERNEL);
+ if (!dio)
+ return -ENOMEM;
+
+ dio->bio = NULL;
+ dio->bvec = NULL;
+ dio->inode = inode;
+ dio->iocb = iocb;
+ dio->rw = rw;
+ dio->blkbits = blkbits;
+ dio->block_in_file = offset >> blkbits;
+ dio->blocks_available = 0;
 
- dio.page_errors = 0;
+ dio->results = 0;
+ dio->boundary = 0;
+ dio->reap_counter = 0;
+ dio->get_blocks = get_blocks;
+ dio->last_block_in_bio = -1;
+ dio->next_block_in_bio = -1;
+
+ dio->page_errors = 0;
 
         /* BIO completion state */
- atomic_set(&dio.bio_count, 0);
- spin_lock_init(&dio.bio_list_lock);
- dio.bio_list = NULL;
- dio.waiter = NULL;
+ atomic_set(&dio->bio_count, 0);
+ spin_lock_init(&dio->bio_list_lock);
+ dio->bio_list = NULL;
+ dio->waiter = NULL;
 
         for (seg = 0; seg < nr_segs; seg++) {
                 user_addr = (unsigned long)iov[seg].iov_base;
                 bytes = iov[seg].iov_len;
 
                 /* Index into the first page of the first block */
- dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
- dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+ dio->first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+ dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits);
                 /* Page fetching state */
- dio.head = 0;
- dio.tail = 0;
- dio.curr_page = 0;
+ dio->head = 0;
+ dio->tail = 0;
+ dio->curr_page = 0;
 
- dio.total_pages = 0;
+ dio->total_pages = 0;
                 if (user_addr & (PAGE_SIZE-1)) {
- dio.total_pages++;
+ dio->total_pages++;
                         bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
                 }
- dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- dio.curr_user_address = user_addr;
+ dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ dio->curr_user_address = user_addr;
         
- ret = do_direct_IO(&dio);
+ ret = do_direct_IO(dio);
 
                 if (ret) {
- dio_cleanup(&dio);
+ dio_cleanup(dio);
                         break;
                 }
 
- tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
- dio.block_in_file) << blkbits);
+ dio->results += iov[seg].iov_len - ((dio->final_block_in_request -
+ dio->block_in_file) << blkbits);
 
         } /* end iovec loop */
 
- ret2 = dio_await_completion(&dio);
- if (ret == 0)
- ret = ret2;
+ if (dio->bio)
+ dio_bio_submit(dio);
+
         if (ret == 0)
- ret = dio.page_errors;
+ ret = dio->page_errors;
         if (ret == 0)
- ret = tot_bytes;
-
+ ret = -EIOCBQUEUED;
         return ret;
 }
 
@@ -606,8 +562,9 @@
  * This is a library function for use by filesystem drivers.
  */
 int
-generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+generic_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ get_blocks_t get_blocks)
 {
         int seg;
         size_t size;
@@ -636,19 +593,21 @@
                         goto out;
         }
 
- retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
+ retval = direct_io_worker(rw, iocb, inode, iov, offset,
+ nr_segs, get_blocks);
 out:
         return retval;
 }
 
 ssize_t
-generic_file_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+generic_file_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
         struct address_space *mapping = inode->i_mapping;
         ssize_t retval;
 
- retval = mapping->a_ops->direct_IO(rw, inode, iov, offset, nr_segs);
+ retval = mapping->a_ops->direct_IO(rw, iocb, inode, iov,
+ offset, nr_segs);
         if (inode->i_mapping->nrpages)
                 invalidate_inode_pages2(inode->i_mapping);
         return retval;
diff -Naur -X dontdiff linux-2.5.35/fs/ext2/inode.c linux-2.5.35.aio/fs/ext2/inode.c
--- linux-2.5.35/fs/ext2/inode.c Sun Sep 15 19:18:19 2002
+++ linux-2.5.35.aio/fs/ext2/inode.c Tue Sep 17 09:00:26 2002
@@ -619,10 +619,10 @@
 }
 
 static int
-ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ext2_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov,loff_t offset, unsigned long nr_segs)
 {
- return generic_direct_IO(rw, inode, iov,
+ return generic_direct_IO(rw, iocb, inode, iov,
                                 offset, nr_segs, ext2_get_blocks);
 }
 
diff -Naur -X dontdiff linux-2.5.35/fs/ext3/inode.c linux-2.5.35.aio/fs/ext3/inode.c
--- linux-2.5.35/fs/ext3/inode.c Sun Sep 15 19:18:41 2002
+++ linux-2.5.35.aio/fs/ext3/inode.c Tue Sep 17 09:00:26 2002
@@ -1399,7 +1399,7 @@
  * If the O_DIRECT write is intantiating holes inside i_size and the machine
  * crashes then stale disk data _may_ be exposed inside the file.
  */
-static int ext3_direct_IO(int rw, struct inode *inode,
+static int ext3_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
                         const struct iovec *iov, loff_t offset,
                         unsigned long nr_segs)
 {
@@ -1430,7 +1430,7 @@
                 }
         }
 
- ret = generic_direct_IO(rw, inode, iov, offset,
+ ret = generic_direct_IO(rw, iocb, inode, iov, offset,
                                 nr_segs, ext3_direct_io_get_blocks);
 
 out_stop:
diff -Naur -X dontdiff linux-2.5.35/fs/jfs/inode.c linux-2.5.35.aio/fs/jfs/inode.c
--- linux-2.5.35/fs/jfs/inode.c Sun Sep 15 19:18:15 2002
+++ linux-2.5.35.aio/fs/jfs/inode.c Tue Sep 17 09:00:26 2002
@@ -309,10 +309,11 @@
         return generic_block_bmap(mapping, block, jfs_get_block);
 }
 
-static int jfs_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+static int jfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
 {
- return generic_direct_IO(rw, inode, iov,
+ return generic_direct_IO(rw, iocb, inode, iov,
                                 offset, nr_segs, jfs_get_blocks);
 }
 
diff -Naur -X dontdiff linux-2.5.35/include/linux/fs.h linux-2.5.35.aio/include/linux/fs.h
--- linux-2.5.35/include/linux/fs.h Sun Sep 15 19:18:24 2002
+++ linux-2.5.35.aio/include/linux/fs.h Tue Sep 17 09:04:21 2002
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/radix-tree.h>
 #include <linux/bitops.h>
+#include <linux/fs.h>
 
 #include <asm/atomic.h>
 
@@ -279,6 +280,7 @@
  */
 struct page;
 struct address_space;
+struct kiocb;
 
 struct address_space_operations {
         int (*writepage)(struct page *);
@@ -307,7 +309,7 @@
         int (*bmap)(struct address_space *, long);
         int (*invalidatepage) (struct page *, unsigned long);
         int (*releasepage) (struct page *, int);
- int (*direct_IO)(int, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+ int (*direct_IO)(int, struct kiocb *, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
 };
 
 struct backing_dev_info;
@@ -743,7 +745,6 @@
  * read, write, poll, fsync, readv, writev can be called
  * without the big kernel lock held in all filesystems.
  */
-struct kiocb;
 struct file_operations {
         struct module *owner;
         loff_t (*llseek) (struct file *, loff_t, int);
@@ -1248,10 +1249,10 @@
                                 unsigned long nr_segs, loff_t *ppos);
 extern ssize_t generic_file_sendfile(struct file *, struct file *, loff_t *, size_t);
 extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
-extern ssize_t generic_file_direct_IO(int rw, struct inode *inode,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs);
-extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec
- *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
+extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, struct inode * inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+extern int generic_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ get_blocks_t *get_blocks);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
         unsigned long nr_segs, loff_t *ppos);
 ssize_t generic_file_writev(struct file *filp, const struct iovec *iov,
diff -Naur -X dontdiff linux-2.5.35/mm/filemap.c linux-2.5.35.aio/mm/filemap.c
--- linux-2.5.35/mm/filemap.c Sun Sep 15 19:18:26 2002
+++ linux-2.5.35.aio/mm/filemap.c Tue Sep 17 09:00:26 2002
@@ -1153,7 +1153,7 @@
                                 nr_segs = iov_shorten((struct iovec *)iov,
                                                         nr_segs, count);
                         }
- retval = generic_file_direct_IO(READ, inode,
+ retval = generic_file_direct_IO(READ, iocb, inode,
                                         iov, pos, nr_segs);
                         if (retval > 0)
                                 *ppos = pos + retval;
@@ -1989,7 +1989,9 @@
                                            current iovec */
         unsigned long seg;
         char *buf;
+ struct kiocb kiocb;
 
+ init_sync_kiocb(&kiocb, file);
         if (unlikely((ssize_t)count < 0))
                 return -EINVAL;
 
@@ -2099,8 +2101,11 @@
                 if (count != ocount)
                         nr_segs = iov_shorten((struct iovec *)iov,
                                                 nr_segs, count);
- written = generic_file_direct_IO(WRITE, inode,
+ written = generic_file_direct_IO(WRITE, &kiocb, inode,
                                         iov, pos, nr_segs);
+ if (written == -EIOCBQUEUED)
+ written = wait_on_sync_kiocb(&kiocb);
+
                 if (written > 0) {
                         loff_t end = pos + written;
                         if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Mon Sep 23 2002 - 22:00:20 EST