Re: [PATCH 1/7] writeback: move dirty inodes from super_block to backing_dev_info

From: Jan Kara
Date: Tue Mar 24 2009 - 12:18:11 EST


> This is a first step at introducing per-bdi flusher threads. We should
> have no change in behaviour, although sb_has_dirty_inodes() is now
> ridiculously expensive, as there's no easy way to answer that question.
> Not a huge problem, since it'll be deleted in subsequent patches.
Could you maybe expand the changelog a bit? If I read the patch right
the only thing it does is that it moves from per-sb inode lists to
per-bdi inode lists, right? Also sync_sb_inodes() now writes all the
inodes in the system, not just the ones for that superblock, doesn't it?

Honza
>
> Signed-off-by: Jens Axboe <jens.axboe@xxxxxxxxxx>
> ---
> fs/fs-writeback.c | 186 +++++++++++++++++++++++++++----------------
> fs/super.c | 3 -
> include/linux/backing-dev.h | 9 ++
> include/linux/fs.h | 5 +-
> mm/backing-dev.c | 31 +++++++-
> mm/page-writeback.c | 1 -
> 6 files changed, 156 insertions(+), 79 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index e5eaa62..c107cff 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -25,6 +25,7 @@
> #include <linux/buffer_head.h>
> #include "internal.h"
>
> +#define inode_to_bdi(inode) (inode)->i_mapping->backing_dev_info
>
> /**
> * writeback_acquire - attempt to get exclusive writeback access to a device
> @@ -158,12 +159,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> goto out;
>
> /*
> - * If the inode was already on s_dirty/s_io/s_more_io, don't
> - * reposition it (that would break s_dirty time-ordering).
> + * If the inode was already on b_dirty/b_io/b_more_io, don't
> + * reposition it (that would break b_dirty time-ordering).
> */
> if (!was_dirty) {
> inode->dirtied_when = jiffies;
> - list_move(&inode->i_list, &sb->s_dirty);
> + list_move(&inode->i_list,
> + &inode_to_bdi(inode)->b_dirty);
> }
> }
> out:
> @@ -184,31 +186,30 @@ static int write_inode(struct inode *inode, int sync)
> * furthest end of its superblock's dirty-inode list.
> *
> * Before stamping the inode's ->dirtied_when, we check to see whether it is
> - * already the most-recently-dirtied inode on the s_dirty list. If that is
> + * already the most-recently-dirtied inode on the b_dirty list. If that is
> * the case then the inode must have been redirtied while it was being written
> * out and we don't reset its dirtied_when.
> */
> static void redirty_tail(struct inode *inode)
> {
> - struct super_block *sb = inode->i_sb;
> + struct backing_dev_info *bdi = inode_to_bdi(inode);
>
> - if (!list_empty(&sb->s_dirty)) {
> - struct inode *tail_inode;
> + if (!list_empty(&bdi->b_dirty)) {
> + struct inode *tail;
>
> - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
> - if (!time_after_eq(inode->dirtied_when,
> - tail_inode->dirtied_when))
> + tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
> + if (!time_after_eq(inode->dirtied_when, tail->dirtied_when))
> inode->dirtied_when = jiffies;
> }
> - list_move(&inode->i_list, &sb->s_dirty);
> + list_move(&inode->i_list, &bdi->b_dirty);
> }
>
> /*
> - * requeue inode for re-scanning after sb->s_io list is exhausted.
> + * requeue inode for re-scanning after bdi->b_io list is exhausted.
> */
> static void requeue_io(struct inode *inode)
> {
> - list_move(&inode->i_list, &inode->i_sb->s_more_io);
> + list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
> }
>
> static void inode_sync_complete(struct inode *inode)
> @@ -240,18 +241,50 @@ static void move_expired_inodes(struct list_head *delaying_queue,
> /*
> * Queue all expired dirty inodes for io, eldest first.
> */
> -static void queue_io(struct super_block *sb,
> - unsigned long *older_than_this)
> +static void queue_io(struct backing_dev_info *bdi,
> + unsigned long *older_than_this)
> {
> - list_splice_init(&sb->s_more_io, sb->s_io.prev);
> - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
> + list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
> + move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
> +}
> +
> +static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
> +{
> + struct inode *inode;
> + int ret = 0;
> +
> + spin_lock(&inode_lock);
> + list_for_each_entry(inode, list, i_list) {
> + if (inode->i_sb == sb) {
> + ret = 1;
> + break;
> + }
> + }
> + spin_unlock(&inode_lock);
> + return ret;
> }
>
> int sb_has_dirty_inodes(struct super_block *sb)
> {
> - return !list_empty(&sb->s_dirty) ||
> - !list_empty(&sb->s_io) ||
> - !list_empty(&sb->s_more_io);
> + struct backing_dev_info *bdi;
> + int ret = 0;
> +
> + /*
> + * This is REALLY expensive right now, but it'll go away
> + * when the bdi writeback is introduced
> + */
> + rcu_read_lock();
> + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
> + if (sb_on_inode_list(sb, &bdi->b_dirty) ||
> + sb_on_inode_list(sb, &bdi->b_io) ||
> + sb_on_inode_list(sb, &bdi->b_more_io)) {
> + ret = 1;
> + break;
> + }
> + }
> + rcu_read_unlock();
> +
> + return ret;
> }
> EXPORT_SYMBOL(sb_has_dirty_inodes);
>
> @@ -305,11 +338,11 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
> /*
> * We didn't write back all the pages. nfs_writepages()
> * sometimes bales out without doing anything. Redirty
> - * the inode; Move it from s_io onto s_more_io/s_dirty.
> + * the inode; Move it from b_io onto b_more_io/b_dirty.
> */
> /*
> * akpm: if the caller was the kupdate function we put
> - * this inode at the head of s_dirty so it gets first
> + * this inode at the head of b_dirty so it gets first
> * consideration. Otherwise, move it to the tail, for
> * the reasons described there. I'm not really sure
> * how much sense this makes. Presumably I had a good
> @@ -319,7 +352,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
> if (wbc->for_kupdate) {
> /*
> * For the kupdate function we move the inode
> - * to s_more_io so it will get more writeout as
> + * to b_more_io so it will get more writeout as
> * soon as the queue becomes uncongested.
> */
> inode->i_state |= I_DIRTY_PAGES;
> @@ -385,10 +418,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
> if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) {
> /*
> * We're skipping this inode because it's locked, and we're not
> - * doing writeback-for-data-integrity. Move it to s_more_io so
> - * that writeback can proceed with the other inodes on s_io.
> + * doing writeback-for-data-integrity. Move it to b_more_io so
> + * that writeback can proceed with the other inodes on b_io.
> * We'll have another go at writing back this inode when we
> - * completed a full scan of s_io.
> + * completed a full scan of b_io.
> */
> requeue_io(inode);
> return 0;
> @@ -411,51 +444,24 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
> return __sync_single_inode(inode, wbc);
> }
>
> -/*
> - * Write out a superblock's list of dirty inodes. A wait will be performed
> - * upon no inodes, all inodes or the final one, depending upon sync_mode.
> - *
> - * If older_than_this is non-NULL, then only write out inodes which
> - * had their first dirtying at a time earlier than *older_than_this.
> - *
> - * If we're a pdlfush thread, then implement pdflush collision avoidance
> - * against the entire list.
> - *
> - * If `bdi' is non-zero then we're being asked to writeback a specific queue.
> - * This function assumes that the blockdev superblock's inodes are backed by
> - * a variety of queues, so all inodes are searched. For other superblocks,
> - * assume that all inodes are backed by the same queue.
> - *
> - * FIXME: this linear search could get expensive with many fileystems. But
> - * how to fix? We need to go from an address_space to all inodes which share
> - * a queue with that address_space. (Easy: have a global "dirty superblocks"
> - * list).
> - *
> - * The inodes to be written are parked on sb->s_io. They are moved back onto
> - * sb->s_dirty as they are selected for writing. This way, none can be missed
> - * on the writer throttling path, and we get decent balancing between many
> - * throttled threads: we don't want them all piling up on inode_sync_wait.
> - */
> -void generic_sync_sb_inodes(struct super_block *sb,
> - struct writeback_control *wbc)
> +static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
> + struct writeback_control *wbc,
> + int is_blkdev)
> {
> const unsigned long start = jiffies; /* livelock avoidance */
> - int sync = wbc->sync_mode == WB_SYNC_ALL;
>
> spin_lock(&inode_lock);
> - if (!wbc->for_kupdate || list_empty(&sb->s_io))
> - queue_io(sb, wbc->older_than_this);
> + if (!wbc->for_kupdate || list_empty(&bdi->b_io))
> + queue_io(bdi, wbc->older_than_this);
>
> - while (!list_empty(&sb->s_io)) {
> - struct inode *inode = list_entry(sb->s_io.prev,
> + while (!list_empty(&bdi->b_io)) {
> + struct inode *inode = list_entry(bdi->b_io.prev,
> struct inode, i_list);
> - struct address_space *mapping = inode->i_mapping;
> - struct backing_dev_info *bdi = mapping->backing_dev_info;
> long pages_skipped;
>
> if (!bdi_cap_writeback_dirty(bdi)) {
> redirty_tail(inode);
> - if (sb_is_blkdev_sb(sb)) {
> + if (is_blkdev) {
> /*
> * Dirty memory-backed blockdev: the ramdisk
> * driver does this. Skip just this inode
> @@ -472,14 +478,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
>
> if (wbc->nonblocking && bdi_write_congested(bdi)) {
> wbc->encountered_congestion = 1;
> - if (!sb_is_blkdev_sb(sb))
> + if (!is_blkdev)
> break; /* Skip a congested fs */
> requeue_io(inode);
> continue; /* Skip a congested blockdev */
> }
>
> if (wbc->bdi && bdi != wbc->bdi) {
> - if (!sb_is_blkdev_sb(sb))
> + if (!is_blkdev)
> break; /* fs has the wrong queue */
> requeue_io(inode);
> continue; /* blockdev has wrong queue */
> @@ -514,13 +520,55 @@ void generic_sync_sb_inodes(struct super_block *sb,
> wbc->more_io = 1;
> break;
> }
> - if (!list_empty(&sb->s_more_io))
> + if (!list_empty(&bdi->b_more_io))
> wbc->more_io = 1;
> }
>
> - if (sync) {
> + spin_unlock(&inode_lock);
> + /* Leave any unwritten inodes on b_io */
> +}
> +
> +/*
> + * Write out a superblock's list of dirty inodes. A wait will be performed
> + * upon no inodes, all inodes or the final one, depending upon sync_mode.
> + *
> + * If older_than_this is non-NULL, then only write out inodes which
> + * had their first dirtying at a time earlier than *older_than_this.
> + *
> + * If we're a pdlfush thread, then implement pdflush collision avoidance
> + * against the entire list.
> + *
> + * If `bdi' is non-zero then we're being asked to writeback a specific queue.
> + * This function assumes that the blockdev superblock's inodes are backed by
> + * a variety of queues, so all inodes are searched. For other superblocks,
> + * assume that all inodes are backed by the same queue.
> + *
> + * FIXME: this linear search could get expensive with many fileystems. But
> + * how to fix? We need to go from an address_space to all inodes which share
> + * a queue with that address_space. (Easy: have a global "dirty superblocks"
> + * list).
> + *
> + * The inodes to be written are parked on bdi->b_io. They are moved back onto
> + * bdi->b_dirty as they are selected for writing. This way, none can be missed
> + * on the writer throttling path, and we get decent balancing between many
> + * throttled threads: we don't want them all piling up on inode_sync_wait.
> + */
> +void generic_sync_sb_inodes(struct super_block *sb,
> + struct writeback_control *wbc)
> +{
> + const int is_blkdev = sb_is_blkdev_sb(sb);
> + struct backing_dev_info *bdi;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
> + generic_sync_bdi_inodes(bdi, wbc, is_blkdev);
> + rcu_read_unlock();
> +
> + if (wbc->sync_mode == WB_SYNC_ALL) {
> struct inode *inode, *old_inode = NULL;
>
> + spin_lock(&inode_lock);
> +
> /*
> * Data integrity sync. Must wait for all pages under writeback,
> * because there may have been pages dirtied before our sync
> @@ -557,10 +605,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
> }
> spin_unlock(&inode_lock);
> iput(old_inode);
> - } else
> - spin_unlock(&inode_lock);
> + }
>
> - return; /* Leave any unwritten inodes on s_io */
> }
> EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
>
> @@ -575,8 +621,8 @@ static void sync_sb_inodes(struct super_block *sb,
> *
> * Note:
> * We don't need to grab a reference to superblock here. If it has non-empty
> - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
> - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
> + * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed
> + * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
> * empty. Since __sync_single_inode() regains inode_lock before it finally moves
> * inode from superblock lists we are OK.
> *
> diff --git a/fs/super.c b/fs/super.c
> index 8349ed6..e3c5b6f 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -64,9 +64,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
> s = NULL;
> goto out;
> }
> - INIT_LIST_HEAD(&s->s_dirty);
> - INIT_LIST_HEAD(&s->s_io);
> - INIT_LIST_HEAD(&s->s_more_io);
> INIT_LIST_HEAD(&s->s_files);
> INIT_LIST_HEAD(&s->s_instances);
> INIT_HLIST_HEAD(&s->s_anon);
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index bee52ab..bb58c95 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -40,6 +40,8 @@ enum bdi_stat_item {
> #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
>
> struct backing_dev_info {
> + struct list_head bdi_list;
> +
> unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
> unsigned long state; /* Always use atomic bitops on this */
> unsigned int capabilities; /* Device capabilities */
> @@ -58,6 +60,10 @@ struct backing_dev_info {
>
> struct device *dev;
>
> + struct list_head b_dirty; /* dirty inodes */
> + struct list_head b_io; /* parked for writeback */
> + struct list_head b_more_io; /* parked for more writeback */
> +
> #ifdef CONFIG_DEBUG_FS
> struct dentry *debug_dir;
> struct dentry *debug_stats;
> @@ -72,6 +78,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
> int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
> void bdi_unregister(struct backing_dev_info *bdi);
>
> +extern spinlock_t bdi_lock;
> +extern struct list_head bdi_list;
> +
> static inline void __add_bdi_stat(struct backing_dev_info *bdi,
> enum bdi_stat_item item, s64 amount)
> {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 92734c0..3c90eb4 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -648,7 +648,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
>
> struct inode {
> struct hlist_node i_hash;
> - struct list_head i_list;
> + struct list_head i_list; /* backing dev IO list */
> struct list_head i_sb_list;
> struct list_head i_dentry;
> unsigned long i_ino;
> @@ -1155,9 +1155,6 @@ struct super_block {
> struct xattr_handler **s_xattr;
>
> struct list_head s_inodes; /* all inodes */
> - struct list_head s_dirty; /* dirty inodes */
> - struct list_head s_io; /* parked for writeback */
> - struct list_head s_more_io; /* parked for more writeback */
> struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
> struct list_head s_files;
> /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 8e85874..cf1528b 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -7,8 +7,9 @@
> #include <linux/writeback.h>
> #include <linux/device.h>
>
> -
> static struct class *bdi_class;
> +DEFINE_SPINLOCK(bdi_lock);
> +LIST_HEAD(bdi_list);
>
> #ifdef CONFIG_DEBUG_FS
> #include <linux/debugfs.h>
> @@ -187,6 +188,10 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
> goto exit;
> }
>
> + spin_lock(&bdi_lock);
> + list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
> + spin_unlock(&bdi_lock);
> +
> bdi->dev = dev;
> bdi_debug_register(bdi, dev_name(dev));
>
> @@ -201,9 +206,23 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
> }
> EXPORT_SYMBOL(bdi_register_dev);
>
> +static void bdi_remove_from_list(struct backing_dev_info *bdi)
> +{
> + spin_lock(&bdi_lock);
> + list_del_rcu(&bdi->bdi_list);
> + spin_unlock(&bdi_lock);
> +
> + /*
> + * In case the bdi is freed right after unregister, we need to
> + * make sure any RCU sections have exited
> + */
> + synchronize_rcu();
> +}
> +
> void bdi_unregister(struct backing_dev_info *bdi)
> {
> if (bdi->dev) {
> + bdi_remove_from_list(bdi);
> bdi_debug_unregister(bdi);
> device_unregister(bdi->dev);
> bdi->dev = NULL;
> @@ -221,6 +240,10 @@ int bdi_init(struct backing_dev_info *bdi)
> bdi->min_ratio = 0;
> bdi->max_ratio = 100;
> bdi->max_prop_frac = PROP_FRAC_BASE;
> + INIT_LIST_HEAD(&bdi->bdi_list);
> + INIT_LIST_HEAD(&bdi->b_io);
> + INIT_LIST_HEAD(&bdi->b_dirty);
> + INIT_LIST_HEAD(&bdi->b_more_io);
>
> for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
> err = percpu_counter_init(&bdi->bdi_stat[i], 0);
> @@ -235,6 +258,8 @@ int bdi_init(struct backing_dev_info *bdi)
> err:
> while (i--)
> percpu_counter_destroy(&bdi->bdi_stat[i]);
> +
> + bdi_remove_from_list(bdi);
> }
>
> return err;
> @@ -245,6 +270,10 @@ void bdi_destroy(struct backing_dev_info *bdi)
> {
> int i;
>
> + WARN_ON(!list_empty(&bdi->b_dirty));
> + WARN_ON(!list_empty(&bdi->b_io));
> + WARN_ON(!list_empty(&bdi->b_more_io));
> +
> bdi_unregister(bdi);
>
> for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 74dc57c..3ec11d8 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -319,7 +319,6 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
> /*
> *
> */
> -static DEFINE_SPINLOCK(bdi_lock);
> static unsigned int bdi_min_ratio;
>
> int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
> --
> 1.6.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Jan Kara <jack@xxxxxxx>
SuSE CR Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/