[RFC] ext4-delayed-allocation.patch

From: Alex Tomas
Date: Fri Dec 22 2006 - 16:11:02 EST




Index: linux-2.6.20-rc1/include/linux/ext4_fs_i.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs_i.h 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs_i.h 2006-12-22 22:56:04.000000000 +0300
@@ -153,6 +153,11 @@ struct ext4_inode_info {

unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
+
+ __u32 i_blocks_reserved;
+ __u32 i_md_reserved;
+ spinlock_t i_wb_reserved_lock; /* to protect i_md_reserved */
+ atomic_t i_wb_writers;
};

#endif /* _LINUX_EXT4_FS_I */
Index: linux-2.6.20-rc1/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs.h 2006-12-22 22:56:03.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs.h 2006-12-22 22:56:04.000000000 +0300
@@ -401,6 +401,7 @@ struct ext4_inode {
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
+#define EXT4_MOUNT_DELAYED_ALLOC 0x1000000/* Delayed allocation support */

/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
@@ -994,6 +995,18 @@ ext4_get_blocks_wrap(handle_t *handle, s
}


+/* writeback.c */
+extern int ext4_wb_writepages(struct address_space *, struct writeback_control *);
+extern int ext4_wb_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to);
+extern int ext4_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int ext4_wb_writepage(struct page *, struct writeback_control *);
+extern void ext4_wb_invalidatepage(struct page *, unsigned long);
+extern int ext4_wb_releasepage(struct page *, gfp_t);
+extern int ext4_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+extern void ext4_wb_init(struct super_block *);
+extern void ext4_wb_release(struct super_block *);
+
#endif /* __KERNEL__ */

#endif /* _LINUX_EXT4_FS_H */
Index: linux-2.6.20-rc1/include/linux/ext4_fs_sb.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs_sb.h 2006-12-22 22:56:03.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs_sb.h 2006-12-22 22:56:04.000000000 +0300
@@ -94,6 +94,17 @@ struct ext4_sb_info {
unsigned long s_ext_blocks;
unsigned long s_ext_extents;
#endif
+
+ atomic_t s_wb_congested;
+ atomic_t s_wb_single_pages;
+ atomic_t s_wb_collisions_sp;
+ atomic_t s_wb_allocated;
+ atomic_t s_wb_reqs;
+ atomic_t s_wb_nr_to_write;
+ atomic_t s_wb_collisions;
+ atomic_t s_wb_blocks;
+ atomic_t s_wb_extents;
+ atomic_t s_wb_dropped;
};

#endif /* _LINUX_EXT4_FS_SB */
Index: linux-2.6.20-rc1/include/linux/ext4_fs_extents.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs_extents.h 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs_extents.h 2006-12-22 22:56:04.000000000 +0300
@@ -193,6 +193,7 @@ extern int ext4_ext_calc_credits_for_ins
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);

#endif /* _LINUX_EXT4_EXTENTS */

Index: linux-2.6.20-rc1/fs/ext4/super.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/super.c 2006-12-22 22:56:03.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/super.c 2006-12-22 22:56:04.000000000 +0300
@@ -439,6 +439,7 @@ static void ext4_put_super (struct super
struct ext4_super_block *es = sbi->s_es;
int i;

+ ext4_wb_release(sb);
ext4_reserve_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
@@ -506,6 +507,13 @@ static struct inode *ext4_alloc_inode(st
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+
+ /* FIXME: these wb-related fields could be initialized once */
+ ei->i_blocks_reserved = 0;
+ ei->i_md_reserved = 0;
+ atomic_set(&ei->i_wb_writers, 0);
+ spin_lock_init(&ei->i_wb_reserved_lock);
+
return &ei->vfs_inode;
}

@@ -729,7 +737,7 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_extents,
+ Opt_grpquota, Opt_extents, Opt_delayed_alloc,
};

static match_table_t tokens = {
@@ -780,6 +788,7 @@ static match_table_t tokens = {
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
{Opt_extents, "extents"},
+ {Opt_delayed_alloc, "delalloc"},
{Opt_err, NULL},
{Opt_resize, "resize"},
};
@@ -1094,6 +1103,9 @@ clear_qf_name:
else
clear_opt(sbi->s_mount_opt, BARRIER);
break;
+ case Opt_delayed_alloc:
+ set_opt(sbi->s_mount_opt, DELAYED_ALLOC);
+ break;
case Opt_ignore:
break;
case Opt_resize:
@@ -1869,6 +1881,7 @@ static int ext4_fill_super (struct super

ext4_ext_init(sb);
ext4_reserve_init(sb);
+ ext4_wb_init(sb);

lock_kernel();
return 0;
Index: linux-2.6.20-rc1/fs/ext4/extents.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/extents.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/extents.c 2006-12-22 22:56:04.000000000 +0300
@@ -2159,6 +2159,36 @@ int ext4_ext_writepage_trans_blocks(stru
return needed;
}

+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+
+ rcap = ext4_ext_space_root(inode);
+ if (blocks <= rcap) {
+ /* all extents fit to the root */
+ return 0;
+ }
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ num = leafs = (blocks + lcap - 1) / lcap;
+ if (leafs <= rcap) {
+ /* all pointers to leafs fit to the root */
+ return leafs;
+ }
+
+ /* ok. we need separate index block(s) to link all leaf blocks */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
EXPORT_SYMBOL(ext4_mark_inode_dirty);
EXPORT_SYMBOL(ext4_ext_invalidate_cache);
EXPORT_SYMBOL(ext4_ext_insert_extent);
Index: linux-2.6.20-rc1/fs/ext4/Makefile
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/Makefile 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/Makefile 2006-12-22 22:56:04.000000000 +0300
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o

ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o
+ ext4_jbd2.o writeback.o

ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
Index: linux-2.6.20-rc1/fs/ext4/writeback.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/writeback.c 2006-11-30 15:32:10.563465031 +0300
+++ linux-2.6.20-rc1/fs/ext4/writeback.c 2006-12-22 22:59:33.000000000 +0300
@@ -0,0 +1,1167 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@xxxxxxxxxxxxx
+ * Written by Alex Tomas <alex@xxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+/*
+ * TODO:
+ * MUST:
+ * - flush dirty pages in -ENOSPC case in order to free reserved blocks
+ * - direct I/O support
+ * - blocksize != PAGE_CACHE_SIZE support
+ * - store last unwritten page in ext4_wb_writepages() and
+ * continue from it in a next run
+ * WISH:
+ * - should ext4_wb_writepage() try to flush neighbours?
+ * - ext4_wb_block_truncate_page() must flush partial truncated pages
+ * - reservation can be done per write-request in ext4_file_write()
+ * rather than per-page in ext4_wb_commit_write() -- it's quite
+ * expensive to recalculate amount of required metadata for evey page
+ * - re-allocation to improve layout
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/jbd.h>
+#include <linux/ext4_fs_extents.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
+#include <linux/spinlock.h>
+
+/*
+ * If EXT4_WB_STATS is defined, then some stats are collected.
+ * It will be showed upont umount time.
+ */
+#define EXT4_WB_STATS
+
+
+/*
+ * With EXT4_WB_SKIP_SMALL defined the patch will try to avoid
+ * small I/Os ignoring ->writepages() if mapping hasn't enough
+ * contig. dirty pages
+ */
+#define EXT4_WB_SKIP_SMALL__
+
+#define WB_ASSERT(__x__) if (!(__x__)) BUG();
+
+#define WB_DEBUG__
+#ifdef WB_DEBUG
+#define wb_debug(fmt,a...) printk(fmt, ##a);
+#else
+#define wb_debug(fmt,a...)
+#endif
+
+#define WB_MAX_PAGES_PER_EXTENT 32768
+
+#define WB_PAGES_PER_ARRAY 60
+
+struct ext4_wb_pages {
+ struct list_head list;
+ struct page *pages[WB_PAGES_PER_ARRAY];
+ unsigned short num, start;
+};
+
+struct ext4_wb_control {
+ pgoff_t start;
+ int len, extents;
+ int blocks_to_release;
+ struct ext4_wb_pages *pages;
+ struct list_head list;
+ struct address_space *mapping;
+};
+
+
+void ext4_wb_invalidatepage(struct page *, unsigned long);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+
+
+static struct page * ext4_wb_pull_page(struct ext4_wb_control *wc)
+{
+ struct ext4_wb_pages *wp = wc->pages;
+
+ BUG_ON(wp == NULL);
+ BUG_ON(list_empty(&wc->list));
+ BUG_ON(list_empty(&wp->list));
+ if (wp->start == wp->num) {
+ list_del(&wp->list);
+ kfree(wp);
+ if (list_empty(&wc->list))
+ return NULL;
+ wp = list_entry(wc->list.next, struct ext4_wb_pages, list);
+ wc->pages = wp;
+ }
+ BUG_ON(list_empty(&wp->list));
+ return wp->pages[wp->start++];
+}
+
+static struct bio * ext4_wb_bio_alloc(struct inode *inode,
+ sector_t first_block, int nr_vecs)
+{
+ int gfp_flags = GFP_NOFS | __GFP_HIGH;
+ struct bio *bio;
+ int maxreq;
+
+ maxreq = bio_get_nr_vecs(inode->i_sb->s_bdev);
+ if (maxreq < nr_vecs)
+ nr_vecs = maxreq;
+
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_sector = first_block << (inode->i_blkbits - 9);
+ }
+ return bio;
+}
+
+static int ext4_wb_end_io(struct bio *bio, unsigned int bytes, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+ if (bio->bi_size)
+ return 1;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (!uptodate)
+ SetPageError(page);
+ end_page_writeback(page);
+ } while (bvec >= bio->bi_io_vec);
+ bio_put(bio);
+ return 0;
+}
+
+static struct bio *ext4_wb_bio_submit(struct bio *bio, handle_t *handle)
+{
+ bio->bi_end_io = ext4_wb_end_io;
+ submit_bio(WRITE, bio);
+ return NULL;
+}
+
+int inline ext4_wb_reserve_space_page(struct page *page, int blocks)
+{
+ struct inode *inode = page->mapping->host;
+ int total, mdb, err;
+
+ wb_debug("reserve %d blocks for page %lu from inode %lu\n",
+ blocks, page->index, inode->i_ino);
+
+ /* user wants us to reserve blocks for his file. reserving space
+ * for his (data) blocks isn't enough because adding block may
+ * involve allocation index/leaf blocks for tree/blockmap.
+ * so, we need to calculate numbers of needed metadata for worst
+ * case: block per extent */
+
+ spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
+ total = EXT4_I(inode)->i_blocks_reserved + blocks;
+ mdb = ext4_ext_calc_metadata_amount(inode, total);
+
+ /* if blockmap needs more metadata, we have to reserve difference */
+ BUG_ON(mdb < EXT4_I(inode)->i_md_reserved);
+ mdb = mdb - EXT4_I(inode)->i_md_reserved;
+
+ err = ext4_reserve_blocks(inode->i_sb, mdb + blocks);
+ if (err) {
+ /* blocks are exhausted? */
+ spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
+ return err;
+ }
+
+ /* blocks have been reserved, account this. I believe
+ * inode's fields are protected by inode->i_sem */
+ EXT4_I(inode)->i_blocks_reserved += blocks;
+ EXT4_I(inode)->i_md_reserved += mdb;
+ spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
+
+ /* we have reserved space on a disk for the page */
+ SetPageBooked(page);
+ return 0;
+}
+
+/*
+ * release space reserved for @blocks of data
+ * @used signals that @blocks got really allocated and we just
+ * need to release corresponded over-reserved metadata
+ */
+int inline ext4_wb_release_space(struct inode *inode, int blocks, int used)
+{
+ int total, mdb, release;
+
+ spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
+
+ total = EXT4_I(inode)->i_blocks_reserved - blocks;
+ mdb = ext4_ext_calc_metadata_amount(inode, total);
+
+ /* if blockmap needs lesser metadata, we may release difference */
+ BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
+ mdb = EXT4_I(inode)->i_md_reserved - mdb;
+
+ release = mdb;
+ /* drop reservation only for non-used blocks */
+ if (!used)
+ release += blocks;
+ wb_debug("%u %s: release %d/%d blocks from %u/%u reserved for inode %lu\n",
+ blocks, used ? "allocated" : "dropped", used ? 0 : blocks,
+ mdb, EXT4_I(inode)->i_blocks_reserved,
+ EXT4_I(inode)->i_md_reserved, inode->i_ino);
+ if (release)
+ ext4_release_blocks(inode->i_sb, release);
+
+ /* update per-inode reservations */
+ BUG_ON(blocks > EXT4_I(inode)->i_blocks_reserved);
+ EXT4_I(inode)->i_blocks_reserved -= blocks;
+ BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
+ EXT4_I(inode)->i_md_reserved -= mdb;
+
+ spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
+
+ return 0;
+}
+
+static inline int ext4_wb_drop_page_reservation(struct page *page)
+{
+ /* we just allocated blocks for this page. those blocks (and
+ * probably metadata for them) were reserved before. now we
+ * should drop reservation mark from the page. if we didn't
+ * do that then ->invalidatepage() may think page still holds
+ * reserved blocks. we could release reserved blocks right
+ * now, but I'd prefer to make this once per several blocks */
+ wb_debug("drop reservation from page %lu from inode %lu\n",
+ page->index, page->mapping->host->i_ino);
+ BUG_ON(!PageBooked(page));
+ ClearPageBooked(page);
+ return 0;
+}
+
+static int ext4_wb_submit_extent(struct ext4_wb_control *wc, handle_t *handle,
+ struct ext4_extent *ex, int new)
+{
+ struct inode *inode = wc->mapping->host;
+ int blkbits = inode->i_blkbits;
+ struct page *page;
+ unsigned long blk, off, len, remain;
+ unsigned long pstart, plen, prev;
+ struct bio *bio = NULL;
+ int nr_pages;
+
+ /*
+ * we have list of pages in wc and block numbers in ex
+ * let's cook bios from them and start real I/O
+ */
+
+ BUG_ON(PAGE_CACHE_SHIFT < blkbits);
+ BUG_ON(list_empty(&wc->list));
+
+ wb_debug("cook and submit bios for %u/%u/%u for %lu/%u\n",
+ ex->ee_block, ex->ee_len, ex->ee_start, wc->start, wc->len);
+
+ blk = ex->ee_block;
+ remain = ex->ee_len;
+ wc->extents++;
+
+ while (remain) {
+ page = ext4_wb_pull_page(wc);
+ if (page == NULL)
+ break;
+
+ pstart = page->index << (PAGE_CACHE_SHIFT - blkbits);
+ plen = PAGE_SIZE >> blkbits;
+ if (pstart > blk) {
+ /* probably extent covers long space and page
+ * to be written in the middle of it */
+ BUG_ON(pstart - blk >= remain);
+ remain -= pstart - blk;
+ blk = pstart;
+ }
+ BUG_ON(blk < pstart || blk >= pstart + plen);
+
+ BUG_ON(!PageUptodate(page));
+ /* page can get here via mmap(2)
+ * BUG_ON(!PagePrivate(page));*/
+ BUG_ON(new && PageMappedToDisk(page));
+ BUG_ON(!new && !PageMappedToDisk(page));
+ SetPageMappedToDisk(page);
+ if (new && PagePrivate(page)) {
+ /* space is just allocated and it was reserved in
+ * ->commit_write(). time to release reservation.
+ * space may not be reserved if page gets dirty
+ * via mmap. should we reserve it in ->mmap() ? */
+ prev = min(plen, remain);
+ ext4_wb_drop_page_reservation(page);
+ wc->blocks_to_release += prev;
+ }
+
+alloc_new_bio:
+ if (bio == NULL) {
+ /* +2 because head/tail may belong to different pages */
+ nr_pages = (ex->ee_len - (blk - ex->ee_block));
+ nr_pages = (nr_pages >> (PAGE_CACHE_SHIFT - blkbits));
+ off = ex->ee_start + (blk - ex->ee_block);
+ bio = ext4_wb_bio_alloc(inode, off, nr_pages + 2);
+ if (bio == NULL)
+ return -ENOMEM;
+ }
+
+ off = (blk - pstart) << blkbits;
+ prev = min(plen, remain);
+ len = prev << blkbits;
+ if (bio_add_page(bio, page, len, off) < len) {
+ bio = ext4_wb_bio_submit(bio, handle);
+ goto alloc_new_bio;
+ }
+ remain -= prev;
+ blk += prev;
+ if (blk < pstart + plen) {
+ /* extent covers part of the page only.
+ * it's possible that next extent covers
+ * the tail. so, we leave page */
+ printk("blk %lu pstart %lu plen %lu remain %lu prev %lu\n",
+ blk, pstart, plen, remain, prev);
+ wc->pages->start--;
+ BUG_ON(remain != 0);
+ }
+ }
+ if (bio)
+ ext4_wb_bio_submit(bio, handle);
+ BUG_ON(new && remain != 0);
+ return 0;
+}
+
+static ext4_fsblk_t
+ext4_wb_find_goal(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t block)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_fsblk_t bg_start;
+ unsigned long colour;
+ int depth;
+
+ if (path) {
+ struct ext4_extent *ex;
+ depth = path->p_depth;
+
+ /* try to predict block placement */
+ if ((ex = path[depth].p_ext))
+ return ex->ee_start + (block - ex->ee_block);
+
+ /* it looks index is empty
+ * try to find starting from index itself */
+ if (path[depth].p_bh)
+ return path[depth].p_bh->b_blocknr;
+ }
+
+ /* OK. use inode's group */
+ bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+ colour = (current->pid % 16) *
+ (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+ return bg_start + colour + block;
+}
+
+static int ext4_wb_handle_extent(struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_ext_cache *ec,
+ void *cbdata)
+{
+ struct ext4_wb_control *wc = cbdata;
+ struct super_block *sb = inode->i_sb;
+ ext4_fsblk_t goal, pblock;
+ unsigned long tgen, count;
+ struct ext4_extent nex;
+ loff_t new_i_size;
+ handle_t *handle;
+ int i, err;
+
+ if (ec->ec_type == EXT4_EXT_CACHE_EXTENT) {
+ /*
+ * The extent is already allocated. The only thing
+ * we have to do is to flush correspondend pages.
+ */
+ wb_debug("extent %u/%u/%u exist\n",
+ (unsigned) ec->ec_block,
+ (unsigned) ec->ec_len,
+ (unsigned) ec->ec_start);
+ nex.ee_start = ec->ec_start;
+ nex.ee_block = ec->ec_block;
+ nex.ee_len = ec->ec_len;
+ err = ext4_wb_submit_extent(wc, NULL, &nex, 0);
+
+ /* correct on-disk size, if we grow within
+ * already allocated block */
+ new_i_size = (loff_t) nex.ee_block + nex.ee_len;
+ new_i_size = new_i_size << inode->i_blkbits;
+ if (new_i_size > i_size_read(inode))
+ new_i_size = i_size_read(inode);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ EXT4_I(inode)->i_disksize = new_i_size;
+ ext4_dirty_inode(inode);
+ }
+ return err;
+ }
+
+ wb_debug("extent %u/%u DOES NOT exist\n", ec->ec_block, ec->ec_len);
+
+ /* space for some pages we want to flush hasn't allocated
+ * yet. so, it's time to allocate space */
+ tgen = EXT4_I(inode)->i_ext_generation;
+ count = ext4_ext_calc_credits_for_insert(inode, path);
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+ handle = ext4_journal_start(inode, count + EXT4_DATA_TRANS_BLOCKS(sb) + 1);
+ if (IS_ERR(handle)) {
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ return PTR_ERR(handle);
+ }
+
+ /* FIXME: we could analyze current path and advice allocator
+ * to find additional blocks if goal can't be allocated
+ * this is for better interaction between extents and mballoc
+ * plus this should improve overall performance */
+
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ if (tgen != EXT4_I(inode)->i_ext_generation) {
+ /* the tree has changed. so path can be invalid at moment */
+ ext4_journal_stop(handle);
+ return EXT_REPEAT;
+ }
+
+ goal = ext4_wb_find_goal(inode, path, ec->ec_block);
+ count = ec->ec_len;
+
+ /* if this is a tail of closed file, ask allocator don't preallocate */
+ new_i_size = i_size_read(inode) + sb->s_blocksize - 1;
+ new_i_size = new_i_size >> inode->i_blkbits;
+ if (ec->ec_block + count == new_i_size &&
+ !atomic_read(&inode->i_writecount)) {
+ /* XXX: disable preallocation for tail */
+ }
+
+ /* this is a hack to tell the allocator that blocks
+ * we are going to allocated are already reserved */
+ EXT4_I(inode)->i_state |= EXT4_STATE_BLOCKS_RESERVED;
+ pblock = ext4_new_blocks(handle, inode, goal, &count, &err);
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_BLOCKS_RESERVED;
+
+ if (!pblock)
+ goto out;
+
+ BUG_ON(count > ec->ec_len);
+ BUG_ON(count == 0);
+ wb_debug("allocated %llu/%lu for %lu (asked %u)\n",
+ pblock, count, inode->i_ino, ec->ec_len);
+
+ /* insert new extent */
+ nex.ee_start = pblock;
+ nex.ee_start_hi = 0;
+ nex.ee_len = count;
+ nex.ee_block = ec->ec_block;
+ err = ext4_ext_insert_extent(handle, inode, path, &nex);
+ if (err)
+ goto out;
+
+ /*
+ * Putting len of the actual extent we just inserted,
+ * we are asking ext4_ext_walk_space() to continue
+ * scaning after that block
+ */
+ ec->ec_len = nex.ee_len;
+ BUG_ON(nex.ee_len == 0);
+
+#ifdef EXT4_WB_STATS
+ atomic_add(nex.ee_len, &EXT4_SB(inode->i_sb)->s_wb_allocated);
+#endif
+
+ wb_debug("inserted %lu/%lu/%lu for %lu (asked %u)\n",
+ (unsigned long) nex.ee_block, (unsigned long) nex.ee_len,
+ (unsigned long) nex.ee_start, inode->i_ino, ec->ec_len);
+
+ /*
+ * Important! The nex can change after insert. So do not
+ * use ec for following
+ */
+
+ /* block have been allocated for data, so time to drop dirty
+ * in correspondend buffer_heads to prevent corruptions */
+ for (i = 0; i < nex.ee_len; i++)
+ unmap_underlying_metadata(sb->s_bdev, nex.ee_start + i);
+
+ /* correct on-disk inode size */
+ if (nex.ee_len > 0) {
+ new_i_size = (loff_t) nex.ee_block + nex.ee_len;
+ new_i_size = new_i_size << inode->i_blkbits;
+ if (new_i_size > i_size_read(inode))
+ new_i_size = i_size_read(inode);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ EXT4_I(inode)->i_disksize = new_i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+
+ if (ext4_should_order_data(inode))
+ err = ext4_wb_submit_extent(wc, handle, &nex, 1);
+ else
+ err = ext4_wb_submit_extent(wc, NULL, &nex, 1);
+
+ /* we don't want to recalculate needed reservation for
+ * each page. we may do this for each new extent */
+ ext4_wb_release_space(inode, wc->blocks_to_release, 1);
+ wc->blocks_to_release = 0;
+
+out:
+ ext4_journal_stop(handle);
+ if (err)
+ printk("EXT4-fs: writeback error = %d\n", err);
+ return err;
+}
+
+static int ext4_wb_flush(struct ext4_wb_control *wc)
+{
+ struct list_head *cur, *tmp;
+ struct inode *inode;
+ int err, num = 0;
+
+ if (wc->len == 0)
+ return 0;
+
+ inode = wc->mapping->host;
+ wb_debug("start flushing %lu/%u from inode %lu\n",
+ wc->start, wc->len, inode->i_ino);
+
+ wc->pages = list_entry(wc->list.next, struct ext4_wb_pages, list);
+ wc->extents = 0;
+
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ /* FIXME: last page may be partial */
+ err = ext4_ext_walk_space(inode, wc->start, wc->len,
+ ext4_wb_handle_extent, wc);
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+ list_for_each_safe(cur, tmp, &wc->list) {
+ struct ext4_wb_pages *wp;
+ wp = list_entry(cur, struct ext4_wb_pages, list);
+ if (err) {
+ while (wp->start < wp->num) {
+ struct page *page = wp->pages[wp->start];
+ BUG_ON(!PageWriteback(page));
+ end_page_writeback(page);
+ __set_page_dirty_nobuffers(page);
+ wp->start++;
+ }
+ } else {
+ BUG_ON(num != 0);
+ BUG_ON(wp->start != wp->num - 1 &&
+ wp->start != wp->num);
+ }
+ list_del(&wp->list);
+ kfree(wp);
+ num++;
+ }
+ wc->pages = NULL;
+ wc->len = 0;
+ wc->extents = 0;
+
+ return err;
+}
+
+static int ext4_wb_add_page(struct ext4_wb_control *wc, struct page *page)
+{
+ struct ext4_wb_pages * wp = wc->pages;
+
+ if (wp == NULL || wp->num == WB_PAGES_PER_ARRAY) {
+ wp = kmalloc(sizeof(struct ext4_wb_pages), GFP_NOFS);
+ if (wp == NULL) {
+ printk("no mem for ext4_wb_pages!\n");
+ return -ENOMEM;
+ }
+ wp->num = 0;
+ wp->start = 0;
+ list_add_tail(&wp->list, &wc->list);
+ wc->pages = wp;
+ }
+
+ wp->pages[wp->num] = page;
+ wp->num++;
+
+ return 0;
+}
+
+static inline void
+ext4_wb_init_control(struct ext4_wb_control *wc, struct address_space *mapping)
+{
+ wc->mapping = mapping;
+ wc->len = 0;
+ wc->blocks_to_release = 0;
+ INIT_LIST_HEAD(&wc->list);
+ wc->pages = NULL;
+}
+
+static inline int
+ext4_wb_can_merge(struct ext4_wb_control *wc, unsigned long next)
+{
+ if (wc->start + wc->len == next &&
+ wc->len <= WB_MAX_PAGES_PER_EXTENT)
+ return 1;
+ return 0;
+}
+
+int ext4_wb_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct inode *inode = mapping->host;
+ int nr_pages, i, err = 0, done = 0;
+ struct ext4_wb_control wc;
+ struct pagevec pvec;
+ pgoff_t index = 0;
+ int written = 0;
+ int extents = 0;
+ pgoff_t pindex = 0;
+
+ wb_debug("->writepages on inode %lu (%u reserved)\n",
+ inode->i_ino, EXT4_I(inode)->i_blocks_reserved);
+#ifdef EXT4_WB_SKIP_SMALL
+ if (wbc->nr_to_write <= 64 && wbc->sync_mode == WB_SYNC_NONE)
+ return 0;
+#endif
+ atomic_inc(&EXT4_I(inode)->i_wb_writers);
+#ifdef EXT4_WB_STATS
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_reqs);
+ atomic_add(wbc->nr_to_write, &EXT4_SB(inode->i_sb)->s_wb_nr_to_write);
+ if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions);
+#endif
+
+ /* skip opened-for-write small files
+ * XXX: what do we do if most of files hit the condition? */
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ atomic_read(&inode->i_writecount) &&
+ i_size_read(inode) <= 64*1024) {
+ return 0;
+ }
+
+ ext4_wb_init_control(&wc, mapping);
+
+ pagevec_init(&pvec, 0);
+ while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ lock_page(page);
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ continue;
+ }
+ if (PageWriteback(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (wc.len && ext4_wb_can_merge(&wc, page->index) &&
+ wbc->nr_to_write <= 0) {
+ /*
+ * If we already exhausted blocks we got
+ * to write and new extent starts, stop
+ * writeback
+ */
+ unlock_page(page);
+ done = 1;
+ break;
+
+ }
+
+ if (!clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ set_page_writeback(page);
+ unlock_page(page);
+
+ if (wc.len == 0) {
+ wc.start = page->index;
+ wc.len = 1;
+ extents++;
+ } else if (ext4_wb_can_merge(&wc, page->index)) {
+ wc.len++;
+ } else {
+ /* end of current extent: flush it ... */
+#if 0
+ if (wc.len < 64 && wc.len > 0) {
+ printk("#%u: wow! short extent %d for flush on #%lu\n",
+ (unsigned) current->pid, wc.len, inode->i_ino);
+ printk("#%u: done = %d, nr_to_write %ld, sync = %d\n",
+ (unsigned) current->pid, done, wbc->nr_to_write,
+ wbc->sync_mode);
+ printk("#%u: written %d, extents %d\n",
+ (unsigned) current->pid, written, extents);
+ printk("#%u: cur %lu, prev %lu\n",
+ (unsigned) current->pid,
+ (unsigned long) page->index,
+ (unsigned long) pindex);
+ }
+#endif
+ err = ext4_wb_flush(&wc);
+ if (err) {
+ done = 1;
+ end_page_writeback(page);
+ break;
+ }
+
+ /* ... and start new one */
+ BUG_ON(!PageWriteback(page));
+ wc.start = page->index;
+ wc.len = 1;
+ extents++;
+ }
+
+ pindex = page->index;
+ err = ext4_wb_add_page(&wc, page);
+ if (err) {
+ done = 1;
+ end_page_writeback(page);
+ break;
+ }
+ written++;
+
+ wbc->nr_to_write--;
+#if 0
+ if ((--(wbc->nr_to_write) <= 0))
+ done = 1;
+#endif
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+#ifdef EXT4_WB_STATS
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_congested);
+#endif
+ wbc->encountered_congestion = 1;
+ done = 1;
+ }
+ }
+ pagevec_release(&pvec);
+ }
+ if (!err) {
+#ifdef EXT4_WB_SKIP_SMALL
+ if (wc.len > 0 && wc.len < 64 && wbc->sync_mode == WB_SYNC_NONE) {
+ struct list_head *cur, *tmp;
+ list_for_each_safe(cur, tmp, &wc.list) {
+ struct ext4_wb_pages *wp;
+ wp = list_entry(cur, struct ext4_wb_pages, list);
+ for (i = wp->start; i < wp->num; i++) {
+ struct page *page = wp->pages[i];
+ BUG_ON(!PageWriteback(page));
+ end_page_writeback(page);
+ __set_page_dirty_nobuffers(page);
+ }
+ wbc->nr_to_write += i;
+ list_del(&wp->list);
+ kfree(wp);
+ }
+ } else
+#endif
+ ext4_wb_flush(&wc);
+ }
+
+ atomic_dec(&EXT4_I(inode)->i_wb_writers);
+
+#ifdef EXT4_WB_STATS
+ atomic_add(written, &EXT4_SB(inode->i_sb)->s_wb_blocks);
+ atomic_add(extents, &EXT4_SB(inode->i_sb)->s_wb_extents);
+#endif
+ return 0;
+}
+
+static void ext4_wb_clear_page(struct page *page, int from, int to)
+{
+ void *kaddr;
+
+ if (to < PAGE_CACHE_SIZE || from > 0) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (PAGE_CACHE_SIZE > to)
+ memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+ if (0 < from)
+ memset(kaddr, 0, from);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+}
+
+int ext4_wb_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ struct buffer_head bh, *bhw = &bh;
+ int err = 0;
+
+ wb_debug("prepare page %lu (%u-%u) for inode %lu\n",
+ page->index, from, to, page->mapping->host->i_ino);
+
+ /* if page is uptodate this means that ->prepare_write() has
+ * been called on page before and page is mapped to disk or
+ * we did reservation. page is protected and nobody can
+ * access it. hence, it safe to use page->private to pass
+ * flag that ->commit_write() has to reserve blocks. because
+ * an error may occur after ->prepare_write() we should not
+ * reserve block here. it's better to do in ->commit_write()
+ * when we're sure page is to be written */
+ page->private = 0;
+ if (!PageUptodate(page)) {
+ /* first write to this page */
+ bh.b_state = 0;
+ err = ext4_get_block(inode, page->index, bhw, 0);
+ if (err)
+ return err;
+ if (!buffer_mapped(bhw)) {
+ /* this block isn't allocated yet, reserve space */
+ wb_debug("reserve space for new block\n");
+ page->private = 1;
+ ext4_wb_clear_page(page, from, to);
+ ClearPageMappedToDisk(page);
+ } else {
+ /* block is already mapped, so no need to reserve */
+ BUG_ON(PagePrivate(page));
+ if (to - from < PAGE_CACHE_SIZE) {
+ wb_debug("read block %u\n",
+ (unsigned) bhw->b_blocknr);
+ set_bh_page(bhw, page, 0);
+ bhw->b_this_page = 0;
+ bhw->b_size = 1 << inode->i_blkbits;
+ atomic_set(&bhw->b_count, 1);
+ ll_rw_block(READ, 1, &bhw);
+ wait_on_buffer(bhw);
+ if (!buffer_uptodate(bhw))
+ return -EIO;
+ }
+ SetPageMappedToDisk(page);
+ }
+ } else if (!PageMappedToDisk(page) && !PagePrivate(page)) {
+ /* this page was a hole at time of mmap() calling
+ * now someone wants to modify it by sys_write() */
+ wb_debug("reserve block for hole\n");
+ page->private = 1;
+ }
+
+ return 0;
+}
+
+int ext4_wb_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ struct inode *inode = page->mapping->host;
+ int err = 0;
+
+ wb_debug("commit page %lu (%u-%u) for inode %lu\n",
+ page->index, from, to, inode->i_ino);
+
+ /* mark page private so that we get
+ * called to invalidate/release page */
+ SetPagePrivate(page);
+
+ if (!PageBooked(page) && !PageMappedToDisk(page)) {
+ /* ->prepare_write() observed that block for this
+ * page hasn't been allocated yet. there fore it
+ * asked to reserve block for later allocation */
+ BUG_ON(page->private == 0);
+ page->private = 0;
+ err = ext4_wb_reserve_space_page(page, 1);
+ if (err)
+ return err;
+ }
+
+ /* ok. block for this page is allocated already or it has
+ * been reserved succesfully. so, user may use it */
+ __set_page_dirty_nobuffers(page);
+
+ SetPageUptodate(page);
+
+ /* correct in-core size, on-disk size will
+ * be corrected upon allocation */
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+
+ return err;
+}
+
+int ext4_wb_write_single_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct ext4_wb_control wc;
+ int err;
+
+ atomic_inc(&EXT4_I(inode)->i_wb_writers);
+
+#ifdef EXT4_WB_STATS
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_single_pages);
+ if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions_sp);
+#endif
+
+ ext4_wb_init_control(&wc, page->mapping);
+
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+
+ wc.start = page->index;
+ wc.len = 1;
+
+ err = ext4_wb_add_page(&wc, page);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: cant add page at %s:%d - %d\n",
+ __FILE__, __LINE__, err);
+ end_page_writeback(page);
+ return err;
+ }
+ err = ext4_wb_flush(&wc);
+ atomic_dec(&EXT4_I(inode)->i_wb_writers);
+
+ return err;
+}
+
+int ext4_wb_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+ void *kaddr;
+
+ wb_debug("writepage %lu from inode %lu\n", page->index, inode->i_ino);
+
+ /*
+ * FIXME: just to play ...
+ * If another thread is writing inode's data and the page
+ * hasn't space on a disk yet, leave it for that thread
+ */
+#if 1
+ if (atomic_read(&EXT4_I(page->mapping->host)->i_wb_writers)
+ && !PageMappedToDisk(page)) {
+ __set_page_dirty_nobuffers(page);
+ unlock_page(page);
+ return 0;
+ }
+#endif
+
+ /* we give up here if we're reentered, because
+ * it might be for a different filesystem */
+ if (ext4_journal_current_handle()) {
+ __set_page_dirty_nobuffers(page);
+ unlock_page(page);
+ return 0;
+ }
+
+ /* Is the page fully inside i_size? */
+ if (page->index < end_index)
+ return ext4_wb_write_single_page(page, wbc);
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index >= end_index + 1 || !offset) {
+ /*
+ * The page may have dirty, unmapped buffers. For example,
+ * they may have been added in ext4_writepage(). Make them
+ * freeable here, so the page does not leak.
+ */
+ ext4_wb_invalidatepage(page, 0);
+ unlock_page(page);
+ return 0; /* don't care */
+ }
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ return ext4_wb_write_single_page(page, wbc);
+}
+
+int ext4_wb_releasepage(struct page *page, gfp_t wait)
+{
+ wb_debug("release %sM%sR page %lu from inode %lu (wait %d)\n",
+ PageMappedToDisk(page) ? "" : "!",
+ PageBooked(page) ? "" : "!",
+ page->index, page->mapping->host->i_ino, wait);
+
+ if (PageWriteback(page))
+ return 0;
+
+ if (PagePrivate(page))
+ ClearPagePrivate(page);
+ return 0;
+}
+
+void ext4_wb_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ int ret = 0;
+
+ /* ->invalidatepage() is called when page is marked Private.
+ * for our page being Private mean that space has been
+ * reserved for this page and it is being truncated. so,
+ * it's time to drop reservation */
+ wb_debug("invalidate %sM%sR page %lu from inode %lu (offset %lu)\n",
+ PageMappedToDisk(page) ? "" : "!",
+ PageBooked(page) ? "" : "!",
+ page->index, inode->i_ino, offset);
+
+ if (offset == 0) {
+ if (PageBooked(page)) {
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_dropped);
+ ext4_wb_release_space(inode, 1, 0);
+ ext4_wb_drop_page_reservation(page);
+ }
+ ret = try_to_release_page(page, 0);
+ }
+ return;
+}
+
+int ext4_wb_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ struct inode *inode = mapping->host;
+ struct buffer_head bh, *bhw = &bh;
+ unsigned blocksize, length;
+ void *kaddr;
+ int err = 0;
+
+ wb_debug("partial truncate from %lu on page %lu from inode %lu\n",
+ (unsigned long) from, page->index, inode->i_ino);
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ /* if page isn't uptodate we have to check has it assigned block
+ * if it has then that block is to be read before memset() */
+ if (!PageUptodate(page)) {
+ BUG_ON(PageMappedToDisk(page));
+ bh.b_state = 0;
+ err = ext4_get_block(inode, page->index, bhw, 0);
+ if (err)
+ goto err_out;
+ BUG_ON(buffer_new(bhw));
+ if (buffer_mapped(bhw)) {
+ /* time to retrieve data from a disk */
+ wb_debug("read block %u for part.trunc on %lu\n",
+ (unsigned) bhw->b_blocknr, page->index);
+ set_bh_page(bhw, page, 0);
+ bhw->b_this_page = 0;
+ bhw->b_size = 1 << inode->i_blkbits;
+ atomic_set(&bhw->b_count, 1);
+ ll_rw_block(READ, 1, &bhw);
+ wait_on_buffer(bhw);
+ err = -EIO;
+ if (!buffer_uptodate(bhw))
+ goto err_out;
+ SetPageMappedToDisk(page);
+ } else {
+ wb_debug("zero page %lu (part.trunc)\n", page->index);
+ offset = 0;
+ length = blocksize;
+ }
+ }
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, length);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ SetPageUptodate(page);
+ __set_page_dirty_nobuffers(page);
+
+err_out:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
+void ext4_wb_init(struct super_block *sb)
+{
+ if (!test_opt(sb, DELAYED_ALLOC))
+ return;
+
+ if (PAGE_CACHE_SHIFT != sb->s_blocksize_bits) {
+ printk(KERN_ERR "EXT4-fs: delayed allocation isn't"
+ "supported for PAGE_CACHE_SIZE != blocksize yet\n");
+ clear_opt (EXT4_SB(sb)->s_mount_opt, DELAYED_ALLOC);
+ return;
+ }
+ printk("EXT4-fs: delayed allocation enabled\n");
+}
+
+void ext4_wb_release(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!test_opt(sb, DELAYED_ALLOC))
+ return;
+
+#ifdef EXT4_WB_STATS
+ if (atomic_read(&sbi->s_wb_reqs) == 0)
+ return;
+
+ printk("EXT4-fs: writeback: %d blocks %d extents in %d reqs (%d ave)\n",
+ atomic_read(&sbi->s_wb_blocks),
+ atomic_read(&sbi->s_wb_extents),
+ atomic_read(&sbi->s_wb_reqs),
+ atomic_read(&sbi->s_wb_blocks) / atomic_read(&sbi->s_wb_reqs));
+ printk("EXT4-fs: writeback: %d nr_to_write, %d congestions, %d singles\n",
+ atomic_read(&sbi->s_wb_nr_to_write),
+ atomic_read(&sbi->s_wb_congested),
+ atomic_read(&sbi->s_wb_single_pages));
+ printk("EXT4-fs: writeback: %d collisions, %d single-page collisions\n",
+ atomic_read(&sbi->s_wb_collisions),
+ atomic_read(&sbi->s_wb_collisions_sp));
+ printk("EXT4-fs: writeback: %d allocated, %d dropped\n",
+ atomic_read(&sbi->s_wb_allocated),
+ atomic_read(&sbi->s_wb_dropped));
+#endif
+}
+
Index: linux-2.6.20-rc1/fs/ext4/file.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/file.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/file.c 2006-12-22 22:56:04.000000000 +0300
@@ -35,8 +35,8 @@ static int ext4_release_file (struct ino
{
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
- {
+ (atomic_read(&inode->i_writecount) == 1) &&
+ EXT4_I(inode)->i_blocks_reserved == 0) {
mutex_lock(&EXT4_I(inode)->truncate_mutex);
ext4_discard_reservation(inode);
mutex_unlock(&EXT4_I(inode)->truncate_mutex);
Index: linux-2.6.20-rc1/fs/ext4/inode.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/inode.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/inode.c 2006-12-22 22:56:04.000000000 +0300
@@ -943,7 +943,7 @@ out:

#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)

-static int ext4_get_block(struct inode *inode, sector_t iblock,
+int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
handle_t *handle = journal_current_handle();
@@ -1807,9 +1807,34 @@ static const struct address_space_operat
.releasepage = ext4_releasepage,
};

+static int ext4_wb_set_page_dirty(struct page *page)
+{
+ return __set_page_dirty_nobuffers(page);
+}
+
+static struct address_space_operations ext4_writeback_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_wb_writepage,
+ .writepages = ext4_wb_writepages,
+ .sync_page = block_sync_page,
+ .prepare_write = ext4_wb_prepare_write,
+ .commit_write = ext4_wb_commit_write,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_wb_invalidatepage,
+ .releasepage = ext4_wb_releasepage,
+ .set_page_dirty = ext4_wb_set_page_dirty,
+ .direct_IO = ext4_direct_IO,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (S_ISREG(inode->i_mode) &&
+ (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ test_opt(inode->i_sb, EXTENTS) &&
+ test_opt(inode->i_sb, DELAYED_ALLOC))
+ inode->i_mapping->a_ops = &ext4_writeback_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
@@ -1834,6 +1859,11 @@ int ext4_block_truncate_page(handle_t *h
int err = 0;
void *kaddr;

+ if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ test_opt(inode->i_sb, EXTENTS) &&
+ test_opt(inode->i_sb, DELAYED_ALLOC))
+ return ext4_wb_block_truncate_page(handle, page, mapping, from);
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/