Re: [PATCH 1/3] ext4: Add EXT4_IOC_TRUNCATE_BLOCK_RANGE ioctl

From: Dmitry Monakhov
Date: Sun Jun 23 2013 - 05:30:38 EST


On Sun, 23 Jun 2013 15:07:36 +0900, Namjae Jeon <linkinjeon@xxxxxxxxx> wrote:
> From: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>
What is the difference between this ioctl and generic punch_hole?
>
> The EXT4_IOC_TRUNCATE_BLOCK_RANGE ioctl removes the data blocks in the
> range [start, start + length) and shifts the logical block numbers of
> all data blocks from "start + length" through the last block of the file
> down by the number of removed blocks, so that the logical block numbers
> stay contiguous after block removal.
> Both the inode's disksize and logical size are updated after block
> removal.
>
> Signed-off-by: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>
> Signed-off-by: Ashish Sangwan <a.sangwan@xxxxxxxxxxx>
> ---
> fs/ext4/ext4.h | 8 ++
> fs/ext4/ext4_extents.h | 3 +
> fs/ext4/extents.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++++
> fs/ext4/ioctl.c | 62 ++++++++++++
> 4 files changed, 318 insertions(+)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 6ed348d..df2c411 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -590,6 +590,7 @@ enum {
> #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
> #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
> #define EXT4_IOC_SWAP_BOOT _IO('f', 17)
> +#define EXT4_IOC_TRUNCATE_BLOCK_RANGE _IOW('f', 18, struct truncate_range)
>
> #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
> /*
> @@ -682,6 +683,11 @@ struct move_extent {
> __u64 moved_len; /* moved block length */
> };
>
> +struct truncate_range {
> + __u32 start_block; /* first logical block of the range to remove */
> + __u32 length; /* number of blocks to remove; 0 is rejected */
> +};
> +
> #define EXT4_EPOCH_BITS 2
> #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
> #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS)
> @@ -2692,6 +2698,8 @@ extern int ext4_find_delalloc_range(struct inode *inode,
> extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
> extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> __u64 start, __u64 len);
> +extern int ext4_ext_truncate_range(struct inode *inode, ext4_lblk_t start,
> + ext4_lblk_t end, ext4_lblk_t last_block);
>
>
> /* move_extent.c */
> diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
> index 51bc821..cc113cc 100644
> --- a/fs/ext4/ext4_extents.h
> +++ b/fs/ext4/ext4_extents.h
> @@ -178,6 +178,9 @@ struct ext4_ext_path {
> #define EXT_MAX_INDEX(__hdr__) \
> (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
>
> +#define EXTENT_START_FLAG 0x1 /* p_ext was the first extent of its leaf */
> +#define INDEX_START_FLAG 0x2 /* p_idx was the first index of its node */
> +
> static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
> {
> return (struct ext4_extent_header *) EXT4_I(inode)->i_data;
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 937593e..ed85e34 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4757,3 +4757,248 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>
> return error;
> }
> +
> +/*
> + * ext4_trange_dirty_path: get journal write access to a path buffer
> + * via ext4_ext_get_access(). If the handle has fewer than
> + * 7 * (num + 1) buffer credits left, more credits are first
> + * requested through ext4_ext_truncate_extend_restart().
> + *
> + * @handle: journal handle
> + * @inode: file inode
> + * @path: path element whose buffer access is requested
> + * @num: number of additional struct inode * arguments that follow
> + *
> + * Returns: 0 on success or negative value on error
> + */
> +int ext4_trange_dirty_path(handle_t *handle, struct inode *inode,
> + struct ext4_ext_path *path,
> + int num, ...)
> +{
> + int credits, err, i;
> + struct inode *iptr;
> + va_list args;
> +
> + /*
> + * Check whether the journal credits need to be extended:
> + * 3 for leaf, sb, and inode plus 2 (bmap and group
> + * descriptor) for each block group; assuming two block
> + * groups gives 7 credits per inode (inode plus num more).
> + */
> + if (handle->h_buffer_credits < 7*(num + 1)) {
> + credits = ext4_writepage_trans_blocks(inode);
> + va_start(args, num);
> + for (i = 1; i <= num; i++) {
> + iptr = va_arg(args, struct inode *);
> + credits += ext4_writepage_trans_blocks(iptr);
> + }
> + va_end(args);
> + err = ext4_ext_truncate_extend_restart(handle, inode, credits);
> + /* -EAGAIN is not treated as a failure here */
> + if (err && err != -EAGAIN)
> + return err;
> + }
> + err = ext4_ext_get_access(handle, inode, path);
> + return err;
> +}
> +
> +/*
> + * ext4_ext_update_path: shift the extents of a path to the left.
> + * Every extent from path[depth].p_ext through EXT_LAST_EXTENT() of the
> + * leaf has shift subtracted from its first logical block; when p_ext is
> + * the leaf's first extent the parent index entries are shifted as well.
> + * On-disk ee_block/ei_block are little-endian, hence the le32 helpers.
> + *
> + * @path: path for which extents are updated
> + * @shift: number of blocks to subtract from each first logical block
> + * @inode: file inode
> + * @handle: journal handle
> + * @start_block: set to the block following the last extent updated
> + *
> + * Returns: 0 on success or negative on error.
> + */
> +int ext4_ext_update_path(struct ext4_ext_path *path, ext4_lblk_t shift,
> + struct inode *inode, handle_t *handle,
> + ext4_lblk_t *start_block)
> +{
> + int depth, err = 0, flag = 0;
> + struct ext4_extent *ex_start, *ex_last;
> +
> + depth = path->p_depth;
> + while (depth >= 0) {
> + if (depth == path->p_depth) {
> + ex_start = path[depth].p_ext;
> + if (!ex_start)
> + return -EIO;
> +
> + err = ext4_trange_dirty_path(handle, inode,
> + path + depth, 0);
> + if (err)
> + goto out;
> +
> + if (path[depth].p_ext ==
> + EXT_FIRST_EXTENT(path[depth].p_hdr))
> + flag |= EXTENT_START_FLAG;
> +
> + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
> + while (ex_start <= ex_last) {
> + *start_block = le32_to_cpu(ex_start->ee_block) +
> + ext4_ext_get_actual_len(ex_start);
> + le32_add_cpu(&ex_start->ee_block, -shift);
> + ex_start++;
> + }
> + err = ext4_ext_dirty(handle, inode, path + depth);
> + if (err)
> + goto out;
> + } else {
> + /* If encountered starting extent, update index too */
> + if (path->p_depth - depth == 1) {
> + if (flag & EXTENT_START_FLAG) {
> + /* Update index too */
> + err = ext4_trange_dirty_path(handle,
> + inode, path + depth, 0);
> + if (err)
> + goto out;
> + le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
> + err = ext4_ext_dirty(handle, inode,
> + path + depth);
> + if (err)
> + goto out;
> + flag &= ~EXTENT_START_FLAG;
> + } else
> + /* No need to update any extent index */
> + break;
> + }
> + /* Check, if earlier encountered starting index */
> + if (flag & INDEX_START_FLAG) {
> + err = ext4_trange_dirty_path(handle, inode,
> + path + (depth), 0);
> + if (err)
> + goto out;
> + le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
> + err = ext4_ext_dirty(handle, inode,
> + path + depth);
> + if (err)
> + goto out;
> + flag &= ~INDEX_START_FLAG;
> + }
> + /* Check if this is a starting index */
> + if (path[depth].p_idx ==
> + EXT_FIRST_INDEX(path[depth].p_hdr)) {
> + /* starting of a block */
> + flag |= INDEX_START_FLAG;
> + } else
> + break;
> + }
> + depth--;
> + }
> +out:
> + return err;
> +}
> +
> +/*
> + * ext4_ext_update_logical: move the extents covering logical blocks
> + * [start_block, end_block) of inode shift blocks to the left
> + *
> + * @inode: file inode
> + * @handle: journal handle
> + * @start_block: first logical block whose extent is to be shifted
> + * @shift: number of blocks to be shifted
> + * @end_block: block at which shifting stops (exclusive)
> + *
> + * Returns: 0 on success or negative on failure
> + */
> +static int ext4_ext_update_logical(struct inode *inode, handle_t *handle,
> + ext4_lblk_t start_block, ext4_lblk_t shift,
> + ext4_lblk_t end_block)
> +{
> + struct ext4_ext_path *path;
> + int err = 0;
> +
> + while (start_block < end_block) {
> + path = ext4_ext_find_extent(inode, start_block, NULL);
> + if (IS_ERR(path)) {
> + err = PTR_ERR(path);
> + break;
> + }
> + err = ext4_ext_update_path(path, shift, inode,
> + handle, &start_block);
> + ext4_ext_drop_refs(path);
> + kfree(path);
> + if (err)
> + break;
> + }
> + return err;
> +}
> +
> +/*
> + * ext4_ext_truncate_range: remove blocks [start, end] (inclusive) from
> + * inode, shift the extents after end left, and shrink i_size/i_disksize.
> + *
> + * @inode: file inode
> + * @start: first logical block to remove
> + * @end: last logical block to remove
> + * @last_block: last logical block of the inode
> + *
> + * Returns: 0 on success or negative on error
> + */
> +int ext4_ext_truncate_range(struct inode *inode, ext4_lblk_t start,
> + ext4_lblk_t end, ext4_lblk_t last_block)
> +{
> + int ret, credits;
> + ext4_lblk_t shift = end - start + 1;
> + handle_t *handle;
> + loff_t isize_reduced;
> + int blkbits = inode->i_blkbits;
> + struct address_space *mapping = inode->i_mapping;
> +
> + /* write back dirty pages from start to the end of the file */
> + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> + ret = filemap_write_and_wait_range(mapping,
> + (loff_t)start << blkbits,
> + ((loff_t)(last_block + 1) << blkbits) - 1);
> + if (ret)
> + return ret;
> + }
> + /* byte offsets need loff_t: a 32-bit shift wraps on large files */
> + truncate_inode_pages_range(mapping, (loff_t)start << blkbits, -1);
> + ext4_inode_block_unlocked_dio(inode);
> + inode_dio_wait(inode);
> + down_write(&EXT4_I(inode)->i_data_sem);
> + ext4_discard_preallocations(inode);
> + /* extents past end get shifted, so drop their cached entries too */
> + ret = ext4_es_remove_extent(inode, start, EXT_MAX_BLOCKS - start);
> + if (ret)
> + goto out;
> + credits = ext4_writepage_trans_blocks(inode);
> + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out;
> + }
> +
> + ret = ext4_ext_remove_space(inode, start, end);
> + if (ret)
> + goto journal_stop;
> +
> + ext4_discard_preallocations(inode);
> +
> + if (end < last_block) {
> + ret = ext4_ext_update_logical(inode, handle, end + 1,
> + shift, last_block + 1);
> + if (ret)
> + goto journal_stop;
> + }
> + isize_reduced = (loff_t)shift << blkbits;
> + i_size_write(inode, inode->i_size - isize_reduced);
> + EXT4_I(inode)->i_disksize -= isize_reduced;
> + inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> + ret = ext4_mark_inode_dirty(handle, inode);
> +journal_stop:
> + ext4_journal_stop(handle);
> +out:
> + ext4_inode_resume_unlocked_dio(inode);
> + up_write(&EXT4_I(inode)->i_data_sem);
> + return ret;
> +}
> +
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index 9491ac0..0530daf 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -622,6 +622,68 @@ resizefs_out:
>
> return 0;
> }
> + case EXT4_IOC_TRUNCATE_BLOCK_RANGE:
> + {
> + struct truncate_range tr;
> + ext4_lblk_t last_block, end_block;
> + int error;
> + loff_t i_size = i_size_read(inode);
> +
> + if (!i_size)
> + return 0;
> +
> + if (!(filp->f_mode & FMODE_WRITE))
> + return -EBADF;
> +
> + if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
> + return -EPERM;
> +
> + if (!S_ISREG(inode->i_mode))
> + return -EOPNOTSUPP;
> +
> + if (IS_SWAPFILE(inode))
> + return -EOPNOTSUPP;
> +
> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> + return -EOPNOTSUPP;
> +
> + if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
> + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
> + ext4_msg(sb, KERN_ERR,
> + "Truncate block range not supported with bigalloc");
> + return -EOPNOTSUPP;
> + }
> +
> + if (copy_from_user(&tr, (const void *) arg,
> + sizeof(struct truncate_range)))
> + return -EFAULT;
> +
> + if (!tr.length)
> + return -EINVAL;
> +
> + end_block = tr.start_block + tr.length - 1;
> +
> + last_block = ((round_up(i_size,
> + EXT4_BLOCK_SIZE(inode->i_sb)))
> + >> inode->i_blkbits) - 1;
> + if (tr.start_block > end_block ||
> + tr.start_block > last_block)
> + return -EINVAL;
> +
> + if (end_block > last_block)
> + end_block = last_block;
> +
> + error = mnt_want_write_file(filp);
> + if (error)
> + return error;
> +
> + mutex_lock(&inode->i_mutex);
> + error = ext4_ext_truncate_range(inode, tr.start_block,
> + end_block, last_block);
> + mutex_unlock(&inode->i_mutex);
> + mnt_drop_write_file(filp);
> + return error;
> + }
>
> default:
> return -ENOTTY;
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/