Re: [patch 10/19] direct-to-BIO I/O for swapcache pages

From: Andrew Morton (akpm@zip.com.au)
Date: Mon Jun 17 2002 - 02:13:31 EST


Andrew Morton wrote:
>
> ..
> I have an
> additional patch which converts swap to use mpage_writepages(), so we swap
> out in 16-page BIOs. It works fine, but I don't intend to submit that.
> There just doesn't seem to be any significant advantage to it.
>
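
As background to what follows: mpage_writepage() batches pages into the
current BIO for as long as the BIO has room and each new page's blocks
are contiguous with the previous page's; anything else forces a submit
and a fresh BIO. The patch below adds one more condition to that rule, a
bio->bi_bdev comparison, because striped swap can switch devices
mid-stream. Here is a minimal user-space model of the rule (a sketch
only - the names are invented and this is not kernel code):

	#include <stdio.h>

	/* Stand-ins for the relevant struct bio fields */
	struct toy_bio {
		int bdev;	/* models bio->bi_bdev */
		int vcnt;	/* pages currently in the bio */
		int max_vecs;	/* bio capacity, in pages */
	};

	static void toy_submit(struct toy_bio *bio)
	{
		printf("submit %d pages to bdev %d\n", bio->vcnt, bio->bdev);
		bio->vcnt = 0;
	}

	/*
	 * Add one page whose first block on 'bdev' is 'first_block'.
	 * Submit the current bio first if it is full, discontiguous,
	 * or (the new case) aimed at a different device.
	 */
	static void toy_add_page(struct toy_bio *bio, int bdev,
				 long first_block, long *last_block_in_bio)
	{
		if (bio->vcnt &&
		    (bio->vcnt == bio->max_vecs ||
		     *last_block_in_bio != first_block - 1 ||
		     bio->bdev != bdev))
			toy_submit(bio);
		if (bio->vcnt == 0)
			bio->bdev = bdev;
		bio->vcnt++;
		*last_block_in_bio = first_block; /* one block per page here */
	}

	int main(void)
	{
		struct toy_bio bio = { 0, 0, 16 };
		long last = -2;
		int i;

		for (i = 0; i < 20; i++)	/* 20 contiguous pages, one device */
			toy_add_page(&bio, 0, i, &last);
		toy_add_page(&bio, 1, 0, &last);	/* stripe switched devices */
		if (bio.vcnt)
			toy_submit(&bio);
		return 0;	/* prints 16-, 4- and 1-page submits */
	}

(Assuming BIO_MAX_SIZE of 64k and 4k pages, the capacity works out to 16
pages, which is where the 16-page BIOs mentioned above come from.)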

Just for the record, here is the patch which converts swap writeout to
use large BIOs (via mpage_writepages):

--- 2.5.21/fs/buffer.c~swap-mpage-write Sat Jun 15 17:15:02 2002
+++ 2.5.21-akpm/fs/buffer.c Sat Jun 15 17:15:02 2002
@@ -397,7 +397,7 @@ __get_hash_table(struct block_device *bd
         struct buffer_head *head;
         struct page *page;
 
- index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+ index = block >> (mapping_page_shift(bd_mapping) - bd_inode->i_blkbits);
         page = find_get_page(bd_mapping, index);
         if (!page)
                 goto out;
@@ -1667,7 +1667,7 @@ static int __block_write_full_page(struc
          * handle that here by just cleaning them.
          */
 
- block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ block = page->index << (page_shift(page) - inode->i_blkbits);
         head = page_buffers(page);
         bh = head;
 
@@ -1811,8 +1811,8 @@ static int __block_prepare_write(struct
         char *kaddr = kmap(page);
 
         BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > page_size(page));
+ BUG_ON(to > page_size(page));
         BUG_ON(from > to);
 
         blocksize = 1 << inode->i_blkbits;
@@ -1821,7 +1821,7 @@ static int __block_prepare_write(struct
         head = page_buffers(page);
 
         bbits = inode->i_blkbits;
- block = page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = page->index << (page_shift(page) - bbits);
 
         for(bh = head, block_start = 0; bh != head || !block_start;
             block++, block_start=block_end, bh = bh->b_this_page) {
@@ -1966,8 +1966,8 @@ int block_read_full_page(struct page *pa
                 create_empty_buffers(page, blocksize, 0);
         head = page_buffers(page);
 
- blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
- iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ blocks = page_size(page) >> inode->i_blkbits;
+ iblock = page->index << (page_shift(page) - inode->i_blkbits);
         lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
         bh = head;
         nr = 0;
@@ -2054,7 +2054,7 @@ int generic_cont_expand(struct inode *in
         if (size > inode->i_sb->s_maxbytes)
                 goto out;
 
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+ offset = (size & (mapping_page_size(mapping) - 1)); /* Within page */
 
         /* ugh. in prepare/commit_write, if from==to==start of block, we
         ** skip the prepare. make sure we never send an offset for the start
@@ -2063,7 +2063,7 @@ int generic_cont_expand(struct inode *in
         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
                 offset++;
         }
- index = size >> PAGE_CACHE_SHIFT;
+ index = size >> mapping_page_shift(mapping);
         err = -ENOMEM;
         page = grab_cache_page(mapping, index);
         if (!page)
@@ -2097,31 +2097,31 @@ int cont_prepare_write(struct page *page
         unsigned blocksize = 1 << inode->i_blkbits;
         char *kaddr;
 
- while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
+ while(page->index > (pgpos = *bytes>>page_shift(page))) {
                 status = -ENOMEM;
                 new_page = grab_cache_page(mapping, pgpos);
                 if (!new_page)
                         goto out;
                 /* we might sleep */
- if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
+ if (*bytes>>page_shift(page) != pgpos) {
                         unlock_page(new_page);
                         page_cache_release(new_page);
                         continue;
                 }
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ zerofrom = *bytes & ~page_mask(page);
                 if (zerofrom & (blocksize-1)) {
                         *bytes |= (blocksize-1);
                         (*bytes)++;
                 }
                 status = __block_prepare_write(inode, new_page, zerofrom,
- PAGE_CACHE_SIZE, get_block);
+ page_size(new_page), get_block);
                 if (status)
                         goto out_unmap;
                 kaddr = page_address(new_page);
- memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
+ memset(kaddr+zerofrom, 0, page_size(new_page)-zerofrom);
                 flush_dcache_page(new_page);
                 __block_commit_write(inode, new_page,
- zerofrom, PAGE_CACHE_SIZE);
+ zerofrom, page_size(new_page));
                 kunmap(new_page);
                 unlock_page(new_page);
                 page_cache_release(new_page);
@@ -2132,7 +2132,7 @@ int cont_prepare_write(struct page *page
                 zerofrom = offset;
         } else {
                 /* page covers the boundary, find the boundary offset */
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ zerofrom = *bytes & ~page_mask(page);
 
                 /* if we will expand the thing last block will be filled */
                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
@@ -2192,7 +2192,7 @@ int generic_commit_write(struct file *fi
                 unsigned from, unsigned to)
 {
         struct inode *inode = page->mapping->host;
- loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ loff_t pos = ((loff_t)page->index << page_shift(page)) + to;
         __block_commit_write(inode,page,from,to);
         kunmap(page);
         if (pos > inode->i_size) {
@@ -2205,8 +2205,8 @@ int generic_commit_write(struct file *fi
 int block_truncate_page(struct address_space *mapping,
                         loff_t from, get_block_t *get_block)
 {
- unsigned long index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned long index = from >> mapping_page_shift(mapping);
+ unsigned offset = from & (mapping_page_size(mapping) - 1);
         unsigned blocksize, iblock, length, pos;
         struct inode *inode = mapping->host;
         struct page *page;
@@ -2221,7 +2221,7 @@ int block_truncate_page(struct address_s
                 return 0;
 
         length = blocksize - length;
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ iblock = index << (mapping_page_shift(mapping) - inode->i_blkbits);
         
         page = grab_cache_page(mapping, index);
         err = -ENOMEM;
@@ -2283,7 +2283,7 @@ out:
 int block_write_full_page(struct page *page, get_block_t *get_block)
 {
         struct inode * const inode = page->mapping->host;
- const unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ const unsigned long end_index = inode->i_size >> page_shift(page);
         unsigned offset;
         char *kaddr;
 
@@ -2292,7 +2292,7 @@ int block_write_full_page(struct page *p
                 return __block_write_full_page(inode, page, get_block);
 
         /* Is the page fully outside i_size? (truncate in progress) */
- offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+ offset = inode->i_size & (page_size(page) - 1);
         if (page->index >= end_index+1 || !offset) {
                 unlock_page(page);
                 return -EIO;
@@ -2300,7 +2300,7 @@ int block_write_full_page(struct page *p
 
         /* The page straddles i_size */
         kaddr = kmap(page);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ memset(kaddr + offset, 0, page_size(page) - offset);
         flush_dcache_page(page);
         kunmap(page);
         return __block_write_full_page(inode, page, get_block);
--- 2.5.21/fs/mpage.c~swap-mpage-write Sat Jun 15 17:15:02 2002
+++ 2.5.21-akpm/fs/mpage.c Sat Jun 15 17:15:02 2002
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/highmem.h>
@@ -22,7 +23,7 @@
 
 /*
  * The largest-sized BIO which this code will assemble, in bytes. Set this
- * to PAGE_CACHE_SIZE if your drivers are broken.
+ * to PAGE_SIZE_MAX if your drivers are broken.
  */
 #define MPAGE_BIO_MAX_SIZE BIO_MAX_SIZE
 
@@ -165,7 +166,7 @@ do_mpage_readpage(struct bio *bio, struc
 {
         struct inode *inode = page->mapping->host;
         const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = page_size(page) >> blkbits;
         const unsigned blocksize = 1 << blkbits;
         struct bio_vec *bvec;
         sector_t block_in_file;
@@ -175,23 +176,24 @@ do_mpage_readpage(struct bio *bio, struc
         unsigned page_block;
         unsigned first_hole = blocks_per_page;
         struct block_device *bdev = NULL;
- struct buffer_head bh;
+ struct buffer_head map_bh;
 
         if (page_has_buffers(page))
                 goto confused;
 
- block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = page->index << (page_shift(page) - blkbits);
         last_file_block = (inode->i_size + blocksize - 1) >> blkbits;
+ map_bh.b_page = page;
 
         for (page_block = 0; page_block < blocks_per_page;
                                 page_block++, block_in_file++) {
- bh.b_state = 0;
+ map_bh.b_state = 0;
                 if (block_in_file < last_file_block) {
- if (get_block(inode, block_in_file, &bh, 0))
+ if (get_block(inode, block_in_file, &map_bh, 0))
                                 goto confused;
                 }
 
- if (!buffer_mapped(&bh)) {
+ if (!buffer_mapped(&map_bh)) {
                         if (first_hole == blocks_per_page)
                                 first_hole = page_block;
                         continue;
@@ -202,18 +204,18 @@ do_mpage_readpage(struct bio *bio, struc
 
                 if (page_block) {
                         /* Contiguous blocks? */
- if (bh.b_blocknr != last_page_block + 1)
+ if (map_bh.b_blocknr != last_page_block + 1)
                                 goto confused;
                 } else {
- first_page_block = bh.b_blocknr;
+ first_page_block = map_bh.b_blocknr;
                 }
- last_page_block = bh.b_blocknr;
- bdev = bh.b_bdev;
+ last_page_block = map_bh.b_blocknr;
+ bdev = map_bh.b_bdev;
         }
 
         if (first_hole != blocks_per_page) {
                 memset(kmap(page) + (first_hole << blkbits), 0,
- PAGE_CACHE_SIZE - (first_hole << blkbits));
+ page_size(page) - (first_hole << blkbits));
                 flush_dcache_page(page);
                 kunmap(page);
                 if (first_hole == 0) {
@@ -231,7 +233,7 @@ do_mpage_readpage(struct bio *bio, struc
                 bio = mpage_bio_submit(READ, bio);
 
         if (bio == NULL) {
- unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
+ unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / page_size(page);
 
                 if (nr_bvecs > nr_pages)
                         nr_bvecs = nr_pages;
@@ -246,7 +248,7 @@ do_mpage_readpage(struct bio *bio, struc
         bvec->bv_len = (first_hole << blkbits);
         bvec->bv_offset = 0;
         bio->bi_size += bvec->bv_len;
- if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
+ if (buffer_boundary(&map_bh) || (first_hole != blocks_per_page))
                 bio = mpage_bio_submit(READ, bio);
         else
                 *last_block_in_bio = last_page_block;
@@ -324,7 +326,7 @@ mpage_writepage(struct bio *bio, struct
         struct inode *inode = page->mapping->host;
         const unsigned blkbits = inode->i_blkbits;
         unsigned long end_index;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = page_size(page) >> blkbits;
         struct bio_vec *bvec;
         sector_t last_file_block;
         sector_t block_in_file;
@@ -387,13 +389,14 @@ mpage_writepage(struct bio *bio, struct
          * The page has no buffers: map it to disk
          */
         BUG_ON(!PageUptodate(page));
- block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = page->index << (page_shift(page) - blkbits);
         last_file_block = (inode->i_size - 1) >> blkbits;
         for (page_block = 0; page_block < blocks_per_page;
                                 page_block++, block_in_file++) {
                 struct buffer_head map_bh;
 
                 map_bh.b_state = 0;
+ map_bh.b_page = page;
                 if (get_block(inode, block_in_file, &map_bh, 1))
                         goto confused;
                 if (buffer_new(&map_bh))
@@ -416,13 +419,13 @@ mpage_writepage(struct bio *bio, struct
 
         first_unmapped = page_block;
 
- end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ end_index = inode->i_size >> page_shift(page);
         if (page->index >= end_index) {
- unsigned offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = inode->i_size & (page_size(page) - 1);
 
                 if (page->index > end_index || !offset)
                         goto confused;
- memset(kmap(page) + offset, 0, PAGE_CACHE_SIZE - offset);
+ memset(kmap(page) + offset, 0, page_size(page) - offset);
                 flush_dcache_page(page);
                 kunmap(page);
         }
@@ -431,13 +434,17 @@ page_is_mapped:
 
         /*
          * This page will go to BIO. Do we need to send this BIO off first?
+ * Check for changed bdev - swapper_space striping does this.
          */
- if (bio && (bio->bi_idx == bio->bi_vcnt ||
- *last_block_in_bio != first_page_block - 1))
- bio = mpage_bio_submit(WRITE, bio);
+ if (bio) {
+ if ((bio->bi_idx == bio->bi_vcnt) ||
+ (*last_block_in_bio != first_page_block - 1) ||
+ (bio->bi_bdev != bdev))
+ bio = mpage_bio_submit(WRITE, bio);
+ }
 
         if (bio == NULL) {
- unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
+ unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / page_size(page);
 
                 bio = mpage_alloc(bdev, first_page_block << (blkbits - 9),
                                         nr_bvecs, GFP_NOFS);
--- 2.5.21/include/linux/pagemap.h~swap-mpage-write Sat Jun 15 17:15:02 2002
+++ 2.5.21-akpm/include/linux/pagemap.h Sat Jun 15 17:15:02 2002
@@ -22,6 +22,12 @@
 #define PAGE_CACHE_MASK PAGE_MASK
 #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
 
+#if PAGE_SIZE > PAGE_CACHE_SIZE
+#define PAGE_SIZE_MAX PAGE_SIZE
+#else
+#define PAGE_SIZE_MAX PAGE_CACHE_SIZE
+#endif
+
 #define page_cache_get(x) get_page(x)
 extern void page_cache_release(struct page *);
 
@@ -97,5 +103,35 @@ static inline void wait_on_page_writebac
                 wait_on_page_bit(page, PG_writeback);
 }
 
+static inline unsigned mapping_page_size(struct address_space *mapping)
+{
+ return (mapping == &swapper_space) ? PAGE_SIZE : PAGE_CACHE_SIZE;
+}
+
+static inline unsigned mapping_page_shift(struct address_space *mapping)
+{
+ return (mapping == &swapper_space) ? PAGE_SHIFT : PAGE_CACHE_SHIFT;
+}
+
+static inline unsigned mapping_page_mask(struct address_space *mapping)
+{
+ return (mapping == &swapper_space) ? PAGE_MASK : PAGE_CACHE_MASK;
+}
+
+static inline unsigned page_size(struct page *page)
+{
+ return mapping_page_size(page->mapping);
+}
+
+static inline unsigned page_shift(struct page *page)
+{
+ return mapping_page_shift(page->mapping);
+}
+
+static inline unsigned page_mask(struct page *page)
+{
+ return mapping_page_mask(page->mapping);
+}
+
 extern void end_page_writeback(struct page *page);
 #endif /* _LINUX_PAGEMAP_H */
--- 2.5.21/mm/page_io.c~swap-mpage-write Sat Jun 15 17:15:02 2002
+++ 2.5.21-akpm/mm/page_io.c Sat Jun 15 17:15:03 2002
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
+#include <linux/mpage.h>
 #include <linux/buffer_head.h>
 #include <asm/pgtable.h>
 #include <linux/swapops.h>
@@ -35,6 +36,32 @@ swap_get_block(struct inode *inode, sect
         return 0;
 }
 
+/*
+ * swap_write_get_block() is for use by mpage_writepages(). If it sees a stale
+ * swapcache page (which doesn't need writing), swap_write_get_block() will
+ * return "failure". This causes mpage_writepages() to send off its current
+ * BIO and to fall back to swap_writepage(). Which can simply unlock the page.
+ */
+static int
+swap_write_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ if (remove_exclusive_swap_page(bh_result->b_page))
+ return -1;
+ return swap_get_block(inode, iblock, bh_result, create);
+}
+
+/*
+ * We may have stale swap cache pages in memory: notice them here and get
+ * rid of the unnecessary final write.
+ */
+static int swap_writepage(struct page *page)
+{
+ printk("swap_writepage\n");
+ unlock_page(page);
+ return 0;
+}
+
 static struct bio *
 get_swap_bio(int gfp_flags, struct page *page, bio_end_io_t end_io)
 {
@@ -57,17 +84,6 @@ get_swap_bio(int gfp_flags, struct page
         return bio;
 }
 
-static void end_swap_bio_write(struct bio *bio)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- if (!uptodate)
- SetPageError(page);
- end_page_writeback(page);
- bio_put(bio);
-}
-
 static void end_swap_bio_read(struct bio *bio)
 {
         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -83,32 +99,6 @@ static void end_swap_bio_read(struct bio
         bio_put(bio);
 }
 
-/*
- * We may have stale swap cache pages in memory: notice
- * them here and get rid of the unnecessary final write.
- */
-static int swap_writepage(struct page *page)
-{
- struct bio *bio;
- int ret = 0;
-
- if (remove_exclusive_swap_page(page)) {
- unlock_page(page);
- goto out;
- }
- bio = get_swap_bio(GFP_NOFS, page, end_swap_bio_write);
- if (bio == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- kstat.pswpout++;
- SetPageWriteback(page);
- unlock_page(page);
- submit_bio(WRITE, bio);
-out:
- return ret;
-}
-
 int swap_readpage(struct file *file, struct page *page)
 {
         struct bio *bio;
@@ -125,30 +115,75 @@ int swap_readpage(struct file *file, str
 out:
         return ret;
 }
+
+static int swap_writepages(struct address_space *mapping, int *nr_to_write)
+{
+ int to_write = *nr_to_write;
+ int ret;
+
+ ret = mpage_writepages(mapping, nr_to_write, swap_write_get_block);
+ kstat.pswpout += to_write - *nr_to_write;
+ return ret;
+}
+
 /*
  * swapper_space doesn't have a real inode, so it gets a special vm_writeback()
  * so we don't need swap special cases in generic_vm_writeback().
  *
- * Swap pages are PageLocked and PageWriteback while under writeout so that
- * memory allocators will throttle against them.
+ * Swap pages are PageWriteback while under writeout so that memory allocators
+ * will throttle against them.
  */
 static int swap_vm_writeback(struct page *page, int *nr_to_write)
 {
- struct address_space *mapping = page->mapping;
-
         unlock_page(page);
- return generic_writepages(mapping, nr_to_write);
+ return swap_writepages(page->mapping, nr_to_write);
 }
 
 struct address_space_operations swap_aops = {
         vm_writeback: swap_vm_writeback,
         writepage: swap_writepage,
+ writepages: swap_writepages,
         readpage: swap_readpage,
         sync_page: block_sync_page,
         set_page_dirty: __set_page_dirty_nobuffers,
 };
 
 /*
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ *
+ * Readahead is performed against a single device. Which is perhaps suboptimal
+ * when striped swap is being used. But given that swap uses a one meg chunk
+ * size for striping, chances are that readahead is reading the right pages.
+ *
+ * It would be possible to use mpage and the generic readahead code here.
+ * We'd have to clone mpage_readpages because add_to_swap_cache() does special
+ * things. Doubtful if all this would help much, really.
+ */
+void swapin_readahead(swp_entry_t entry)
+{
+ int i, num;
+ unsigned long offset;
+
+ /*
+ * Get the number of handles we should do readahead io to.
+ */
+ num = valid_swaphandles(entry, &offset);
+ for (i = 0; i < num; offset++, i++) {
+ struct page *new_page;
+ swp_entry_t ra_entry;
+
+ ra_entry = swp_entry(swp_type(entry), offset);
+ new_page = read_swap_cache_async(ra_entry);
+ if (!new_page)
+ break;
+ page_cache_release(new_page);
+ }
+}
+
+/*
  * A scruffy utility function to read or write an arbitrary swap page
  * and wait on the I/O.
  */
--- 2.5.21/mm/swap_state.c~swap-mpage-write Sat Jun 15 17:15:02 2002
+++ 2.5.21-akpm/mm/swap_state.c Sat Jun 15 17:15:03 2002
@@ -8,13 +8,11 @@
  */
 
 #include <linux/mm.h>
-#include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/swapctl.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h> /* block_sync_page() */
 
 #include <asm/pgtable.h>
 
@@ -124,7 +122,7 @@ void delete_from_swap_cache(struct page
 
         BUG_ON(!PageLocked(page));
         BUG_ON(PageWriteback(page));
- BUG_ON(page_has_buffers(page));
+ BUG_ON(PagePrivate(page));
   
         entry.val = page->index;
 
@@ -192,7 +190,7 @@ int move_from_swap_cache(struct page *pa
 
         BUG_ON(!PageLocked(page));
         BUG_ON(PageWriteback(page));
- BUG_ON(page_has_buffers(page));
+ BUG_ON(PagePrivate(page));
 
         write_lock(&swapper_space.page_lock);
         write_lock(&mapping->page_lock);
--- 2.5.21/mm/swapfile.c~swap-mpage-write Sat Jun 15 17:15:02 2002
+++ 2.5.21-akpm/mm/swapfile.c Sat Jun 15 17:15:03 2002
@@ -37,6 +37,11 @@ struct swap_info_struct swap_info[MAX_SW
 
 #define SWAPFILE_CLUSTER 256
 
+/*
+ * Switch to a new device after this many pages
+ */
+#define SWAP_STRIPE_PAGES ((1024*1024)/PAGE_SIZE)
+
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
         unsigned long offset;
@@ -47,7 +52,8 @@ static inline int scan_swap_map(struct s
          * first-free allocation, starting a new cluster. This
          * prevents us from scattering swap pages all over the entire
          * swap partition, so that we reduce overall disk seek times
- * between swap pages. -- sct */
+ * between swap pages. -- sct
+ */
         if (si->cluster_nr) {
                 while (si->cluster_next <= si->highest_bit) {
                         offset = si->cluster_next++;
@@ -59,29 +65,35 @@ static inline int scan_swap_map(struct s
         }
         si->cluster_nr = SWAPFILE_CLUSTER;
 
- /* try to find an empty (even not aligned) cluster. */
+ /*
+ * Try to find an empty (even not aligned) cluster
+ */
         offset = si->lowest_bit;
- check_next_cluster:
- if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
- {
+
+check_next_cluster:
+ if (offset + SWAPFILE_CLUSTER - 1 <= si->highest_bit) {
                 int nr;
- for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
- if (si->swap_map[nr])
- {
- offset = nr+1;
+
+ for (nr = offset; nr < offset + SWAPFILE_CLUSTER; nr++) {
+ if (si->swap_map[nr]) {
+ offset = nr + 1;
                                 goto check_next_cluster;
                         }
- /* We found a completly empty cluster, so start
- * using it.
+ }
+
+ /*
+ * We found a completely empty cluster, so start using it.
                  */
                 goto got_page;
         }
- /* No luck, so now go finegrined as usual. -Andrea */
- for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
+ /*
+ * No luck, so now go finegrained as usual. -Andrea
+ */
+ for (offset = si->lowest_bit; offset <= si->highest_bit; offset++) {
                 if (si->swap_map[offset])
                         continue;
                 si->lowest_bit = offset+1;
- got_page:
+got_page:
                 if (offset == si->lowest_bit)
                         si->lowest_bit++;
                 if (offset == si->highest_bit)
@@ -92,7 +104,7 @@ static inline int scan_swap_map(struct s
                 }
                 si->swap_map[offset] = 1;
                 nr_swap_pages--;
- si->cluster_next = offset+1;
+ si->cluster_next = offset + 1;
                 return offset;
         }
         si->lowest_bit = si->max;
@@ -100,9 +112,11 @@ static inline int scan_swap_map(struct s
         return 0;
 }
 
+int akpm;
+
 swp_entry_t get_swap_page(void)
 {
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
         unsigned long offset;
         swp_entry_t entry;
         int type, wrapped = 0;
@@ -122,11 +136,27 @@ swp_entry_t get_swap_page(void)
                         offset = scan_swap_map(p);
                         swap_device_unlock(p);
                         if (offset) {
- entry = swp_entry(type,offset);
+ static int stripe;
+
+ entry = swp_entry(type, offset);
+
+ /*
+ * Keep using the same device for a decent
+ * number of pages so that we can build nice
+ * big BIOs against it.
+ */
+ if (stripe++ < SWAP_STRIPE_PAGES)
+ goto out;
+ stripe = 0;
+
+ /*
+ * Select the next swapdevice. Stripe across
+ * devices if the priorities are equal.
+ */
                                 type = swap_info[type].next;
                                 if (type < 0 ||
                                         p->prio != swap_info[type].prio) {
- swap_list.next = swap_list.head;
+ swap_list.next = swap_list.head;
                                 } else {
                                         swap_list.next = type;
                                 }
@@ -139,12 +169,15 @@ swp_entry_t get_swap_page(void)
                                 type = swap_list.head;
                                 wrapped = 1;
                         }
- } else
+ } else {
                         if (type < 0)
                                 goto out; /* out of swap space */
+ }
         }
 out:
         swap_list_unlock();
+ if (akpm)
+ printk("%d:%lu\n", swp_type(entry), swp_offset(entry));
         return entry;
 }
 
--- 2.5.21/mm/memory.c~swap-mpage-write Sat Jun 15 17:15:02 2002
+++ 2.5.21-akpm/mm/memory.c Sat Jun 15 17:15:03 2002
@@ -1112,32 +1112,6 @@ out:
         return 0;
 }
 
-/*
- * Primitive swap readahead code. We simply read an aligned block of
- * (1 << page_cluster) entries in the swap area. This method is chosen
- * because it doesn't cost us any seek time. We also make sure to queue
- * the 'original' request together with the readahead ones...
- */
-void swapin_readahead(swp_entry_t entry)
-{
- int i, num;
- struct page *new_page;
- unsigned long offset;
-
- /*
- * Get the number of handles we should do readahead io to.
- */
- num = valid_swaphandles(entry, &offset);
- for (i = 0; i < num; offset++, i++) {
- /* Ok, do the async read-ahead now */
- new_page = read_swap_cache_async(swp_entry(swp_type(entry), offset));
- if (!new_page)
- break;
- page_cache_release(new_page);
- }
- return;
-}
-
 /*
  * We hold the mm semaphore and the page_table_lock on entry and
  * should release the pagetable lock on exit..
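
A note on the get_swap_page() striping above: the idea is to hand out
SWAP_STRIPE_PAGES worth of entries from one device, then rotate to the
next device of equal priority, so writeout can still build big BIOs
within each stripe. Reduced to a toy user-space model (again, invented
names, not kernel code):

	#include <stdio.h>

	#define NR_DEVS			2
	#define SWAP_STRIPE_PAGES	256	/* (1024*1024)/PAGE_SIZE, 4k pages */

	int main(void)
	{
		int type = 0;			/* current swap device */
		int stripe = 0;
		int handed[NR_DEVS] = { 0, 0 };
		int i;

		for (i = 0; i < 1000; i++) {	/* 1000 get_swap_page() calls */
			handed[type]++;
			if (++stripe >= SWAP_STRIPE_PAGES) {
				stripe = 0;
				/* equal priority: rotate to next device */
				type = (type + 1) % NR_DEVS;
			}
		}
		printf("dev0=%d dev1=%d\n", handed[0], handed[1]);
		return 0;	/* dev0=512 dev1=488 */
	}

The one-meg stripe is also why swapin_readahead() can get away with
staying on a single device: a (1 << page_cluster) cluster will almost
always fall inside one stripe.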
