Re: [PATCH v3] fs/ext4: increase parallelism in updating ext4 orphan list

From: Jan Kara
Date: Wed Apr 30 2014 - 06:10:54 EST


On Thu 24-04-14 11:31:26, T Makphaibulchoke wrote:
> This patch allows more concurrency of updates of ext4 orphan list,
> while still maintaining the integrity of both the in memory and on
> disk orphan lists of each update.
>
> This is accomplished by using a mutex from an array of mutexes, indexed
> by hashing of an inode, to serialize orphan list updates of a single
> inode, allowing most prelimary work to be done prior to acquiring the mutex.
> The mutex is acuqired only during the update of both orphan lists,
> reducing its contention.
>
> Here are some of the benchmark results with the changes.
>
> On a 120 core machine,
Just for record I think we can just remove the hashed locks in your patch
and things should work - see patches I've submitted yesterday. But please
check whether you see similar performance numbers with them as I may have
missed some aspect of your patch.

Honza
>
> ---------------------------
> | | % increase |
> ---------------------------
> | alltests | 19.29 |
> ---------------------------
> | custom | 11.10 |
> ---------------------------
> | disk | 5.09 |
> ---------------------------
> | fserver | 12.47 |
> ---------------------------
> | new_dbase | 7.49 |
> ---------------------------
> | shared | 16.55 |
> ---------------------------
>
> On a 80 core machine,
>
> ---------------------------
> | | % increase |
> ---------------------------
> | alltests | 30.09 |
> ---------------------------
> | custom | 22.66 |
> ---------------------------
> | dbase | 3.28 |
> ---------------------------
> | disk | 3.15 |
> ---------------------------
> | fserver | 24.28 |
> ---------------------------
> | high_systime| 6.79 |
> ---------------------------
> | new_dbase | 4.63 |
> ---------------------------
> | new_fserver | 28.40 |
> ---------------------------
> | shared | 20.42 |
> ---------------------------
>
> On a 8 core machine:
>
> ---------------------------
> | | % increase |
> ---------------------------
> | fserver | 9.11 |
> ---------------------------
> | new_fserver | 3.45 |
> ---------------------------
>
> For Swingbench dss workload,
>
> -----------------------------------------------------------------------------
> | Users | 100 | 200 | 300 | 400 | 500 | 600 | 700 | 800 | 900 |
> -----------------------------------------------------------------------------
> | % improve- | 6.15 | 5.49 | 3.13 | 1.06 | 2.31 | 6.29 | 6.50 |-0.6 | 1.72 |
> | ment with | | | | | | | | | |
> | out using | | | | | | | | | |
> | /dev/shm | | | | | | | | | |
> -----------------------------------------------------------------------------
>
> Signed-off-by: T. Makphaibulchoke <tmac@xxxxxx>
> ---
> Changed in v3:
> - Changed patch description
> - Reverted back to using a single mutex, s_ondisk_orphan_mutex, for
> both on disk and in memory orphan lists serialization.
>
> Changed in v2:
> - New performance data
> - Fixed problem in v1
> - No changes in ext4_inode_info size
> - Used an array of mutexes, indexed by hashing of an inode, to serialize
> operations within a single inode
> - Combined multiple patches into one.
> ---
> fs/ext4/ext4.h | 4 +-
> fs/ext4/namei.c | 126 ++++++++++++++++++++++++++++++++++----------------------
> fs/ext4/super.c | 11 ++++-
> 3 files changed, 90 insertions(+), 51 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d3a534f..a348f7c 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -77,6 +77,7 @@
> #define EXT4_ERROR_FILE(file, block, fmt, a...) \
> ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
>
> +#define EXT4_ORPHAN_OP_BITS 7
> /* data type for block offset of block group */
> typedef int ext4_grpblk_t;
>
> @@ -1223,7 +1224,8 @@ struct ext4_sb_info {
> /* Journaling */
> struct journal_s *s_journal;
> struct list_head s_orphan;
> - struct mutex s_orphan_lock;
> + struct mutex s_ondisk_orphan_lock;
> + struct mutex *s_orphan_op_mutex;
> unsigned long s_resize_flags; /* Flags indicating if there
> is a resizer */
> unsigned long s_commit_interval;
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index d050e04..b062e1e 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -48,6 +48,13 @@
> #define NAMEI_RA_BLOCKS 4
> #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
>
> +#define ORPHAN_OP_INDEX(ext4_i) \
> + (hash_long((unsigned long)ext4_i, EXT4_ORPHAN_OP_BITS))
> +#define LOCK_EXT4I_ORPHAN(ext4_sb, ext4_i) \
> + mutex_lock(&ext4_sb->s_orphan_op_mutex[ORPHAN_OP_INDEX(ext4_i)])
> +#define UNLOCK_EXT4I_ORPHAN(ext4_sb, ext4_i) \
> + mutex_unlock(&ext4_sb->s_orphan_op_mutex[ORPHAN_OP_INDEX(ext4_i)])
> +
> static struct buffer_head *ext4_append(handle_t *handle,
> struct inode *inode,
> ext4_lblk_t *block)
> @@ -2556,8 +2563,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> if (!EXT4_SB(sb)->s_journal)
> return 0;
>
> - mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
> - if (!list_empty(&EXT4_I(inode)->i_orphan))
> + LOCK_EXT4I_ORPHAN(EXT4_SB(sb), EXT4_I(inode));
> + if (!list_empty_careful(&EXT4_I(inode)->i_orphan))
> goto out_unlock;
>
> /*
> @@ -2582,9 +2589,20 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> * orphan list. If so skip on-disk list modification.
> */
> if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
> - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
> - goto mem_insert;
> -
> + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
> + brelse(iloc.bh);
> + mutex_lock(&EXT4_SB(sb)->s_ondisk_orphan_lock);
> + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->
> + s_orphan);
> + mutex_unlock(&EXT4_SB(sb)->s_ondisk_orphan_lock);
> + jbd_debug(4, "superblock will point to %lu\n",
> + inode->i_ino);
> + jbd_debug(4, "orphan inode %lu will point to %d\n",
> + inode->i_ino, NEXT_ORPHAN(inode));
> + goto out_unlock;
> + }
> +
> + mutex_lock(&EXT4_SB(sb)->s_ondisk_orphan_lock);
> /* Insert this inode at the head of the on-disk orphan list... */
> NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
> EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
> @@ -2592,24 +2610,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
> rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
> if (!err)
> err = rc;
> -
> - /* Only add to the head of the in-memory list if all the
> - * previous operations succeeded. If the orphan_add is going to
> - * fail (possibly taking the journal offline), we can't risk
> - * leaving the inode on the orphan list: stray orphan-list
> - * entries can cause panics at unmount time.
> - *
> - * This is safe: on error we're going to ignore the orphan list
> - * anyway on the next recovery. */
> -mem_insert:
> - if (!err)
> + if (!err) {
> + /* Only add to the head of the in-memory list if all the
> + * previous operations succeeded. If the orphan_add is going to
> + * fail (possibly taking the journal offline), we can't risk
> + * leaving the inode on the orphan list: stray orphan-list
> + * entries can cause panics at unmount time.
> + *
> + * This is safe: on error we're going to ignore the orphan list
> + * anyway on the next recovery. */
> list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
> -
> - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
> - jbd_debug(4, "orphan inode %lu will point to %d\n",
> + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
> + jbd_debug(4, "orphan inode %lu will point to %d\n",
> inode->i_ino, NEXT_ORPHAN(inode));
> + }
> + mutex_unlock(&EXT4_SB(sb)->s_ondisk_orphan_lock);
> +
> out_unlock:
> - mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
> + UNLOCK_EXT4I_ORPHAN(EXT4_SB(sb), EXT4_I(inode));
> ext4_std_error(inode->i_sb, err);
> return err;
> }
> @@ -2622,45 +2640,54 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
> {
> struct list_head *prev;
> struct ext4_inode_info *ei = EXT4_I(inode);
> - struct ext4_sb_info *sbi;
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> __u32 ino_next;
> struct ext4_iloc iloc;
> int err = 0;
>
> - if ((!EXT4_SB(inode->i_sb)->s_journal) &&
> - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
> + if ((!sbi->s_journal) &&
> + !(sbi->s_mount_state & EXT4_ORPHAN_FS))
> return 0;
>
> - mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
> - if (list_empty(&ei->i_orphan))
> - goto out;
> -
> - ino_next = NEXT_ORPHAN(inode);
> - prev = ei->i_orphan.prev;
> - sbi = EXT4_SB(inode->i_sb);
> -
> - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
> + LOCK_EXT4I_ORPHAN(sbi, ei);
> + if (list_empty_careful(&ei->i_orphan)) {
> + UNLOCK_EXT4I_ORPHAN(sbi, ei);
> + return 0;
> + }
>
> - list_del_init(&ei->i_orphan);
>
> /* If we're on an error path, we may not have a valid
> * transaction handle with which to update the orphan list on
> * disk, but we still need to remove the inode from the linked
> * list in memory. */
> - if (!handle)
> - goto out;
> + if (!handle) {
> + jbd_debug(4, "remove inode %lu from orphan list\n",
> + inode->i_ino);
> + mutex_lock(&sbi->s_ondisk_orphan_lock);
> + list_del_init(&ei->i_orphan);
> + mutex_unlock(&sbi->s_ondisk_orphan_lock);
> + } else
> + err = ext4_reserve_inode_write(handle, inode, &iloc);
>
> - err = ext4_reserve_inode_write(handle, inode, &iloc);
> - if (err)
> - goto out_err;
> + if (!handle || err) {
> + UNLOCK_EXT4I_ORPHAN(sbi, ei);
> + goto out_set_err;
> + }
>
> + mutex_lock(&sbi->s_ondisk_orphan_lock);
> + ino_next = NEXT_ORPHAN(inode);
> + prev = ei->i_orphan.prev;
> + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
> + list_del_init(&ei->i_orphan);
> if (prev == &sbi->s_orphan) {
> jbd_debug(4, "superblock will point to %u\n", ino_next);
> BUFFER_TRACE(sbi->s_sbh, "get_write_access");
> err = ext4_journal_get_write_access(handle, sbi->s_sbh);
> + if (!err)
> + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> + mutex_unlock(&sbi->s_ondisk_orphan_lock);
> if (err)
> - goto out_brelse;
> - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
> + goto brelse;
> err = ext4_handle_dirty_super(handle, inode->i_sb);
> } else {
> struct ext4_iloc iloc2;
> @@ -2670,25 +2697,26 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
> jbd_debug(4, "orphan inode %lu will point to %u\n",
> i_prev->i_ino, ino_next);
> err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
> + if (!err)
> + NEXT_ORPHAN(i_prev) = ino_next;
> + mutex_unlock(&sbi->s_ondisk_orphan_lock);
> if (err)
> - goto out_brelse;
> - NEXT_ORPHAN(i_prev) = ino_next;
> + goto brelse;
> err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
> }
> if (err)
> - goto out_brelse;
> + goto brelse;
> NEXT_ORPHAN(inode) = 0;
> + UNLOCK_EXT4I_ORPHAN(sbi, ei);
> err = ext4_mark_iloc_dirty(handle, inode, &iloc);
> -
> -out_err:
> +out_set_err:
> ext4_std_error(inode->i_sb, err);
> -out:
> - mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
> return err;
>
> -out_brelse:
> +brelse:
> + UNLOCK_EXT4I_ORPHAN(sbi, ei);
> brelse(iloc.bh);
> - goto out_err;
> + goto out_set_err;
> }
>
> static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 710fed2..a4275d1 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -3921,7 +3921,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
>
> INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
> - mutex_init(&sbi->s_orphan_lock);
> + mutex_init(&sbi->s_ondisk_orphan_lock);
> + sbi->s_orphan_op_mutex = kmalloc(sizeof(struct mutex) <<
> + EXT4_ORPHAN_OP_BITS, GFP_KERNEL);
> + BUG_ON(!sbi->s_orphan_op_mutex);
> + if (sbi->s_orphan_op_mutex) {
> + int n = 1 << EXT4_ORPHAN_OP_BITS;
> +
> + while (n-- > 0)
> + mutex_init(&sbi->s_orphan_op_mutex[n]);
> + }
>
> sb->s_root = NULL;
>
> --
> 1.7.11.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/