Re: [PATCH v8 1/2] fs: New zonefs file system

From: Dave Chinner
Date: Tue Jan 21 2020 - 20:58:10 EST


Hi Damien,

I've finally had a chance to recover from LCA, catch up and look at
this again. Overall, pretty good, but a few comments below....

On Tue, Jan 21, 2020 at 03:58:45PM +0900, Damien Le Moal wrote:
> zonefs is a very simple file system exposing each zone of a zoned block
> device as a file. Unlike a regular file system with zoned block device
> support (e.g. f2fs), zonefs does not hide the sequential write
> constraint of zoned block devices to the user. Files representing
> sequential write zones of the device must be written sequentially
> starting from the end of the file (append only writes).

....

> --- /dev/null
> +++ b/fs/zonefs/super.c
> @@ -0,0 +1,1178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Simple zone file system for zoned block devices.
> + *
> + * Copyright (C) 2019 Western Digital Corporation or its affiliates.
> + */
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/magic.h>
> +#include <linux/iomap.h>
> +#include <linux/init.h>
> +#include <linux/slab.h>
> +#include <linux/blkdev.h>
> +#include <linux/statfs.h>
> +#include <linux/writeback.h>
> +#include <linux/quotaops.h>
> +#include <linux/seq_file.h>
> +#include <linux/parser.h>
> +#include <linux/uio.h>
> +#include <linux/mman.h>
> +#include <linux/sched/mm.h>
> +#include <linux/crc32.h>
> +
> +#include "zonefs.h"
> +
> +static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> + unsigned int flags, struct iomap *iomap,
> + struct iomap *srcmap)
> +{
> + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
> + struct zonefs_inode_info *zi = ZONEFS_I(inode);
> + loff_t max_isize = zi->i_max_size;
> + loff_t isize;
> +
> + /*
> + * For sequential zones, enforce direct IO writes. This is already
> + * checked when writes are issued, so warn about this here if we
> + * get buffered write to a sequential file inode.
> + */
> + if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
> + (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
> + return -EIO;
> +
> + /*
> + * For all zones, all blocks are always mapped. For sequential zones,
> + * all blocks after the write pointer (inode size) are always unwritten.
> + */
> + mutex_lock(&zi->i_truncate_mutex);
> + isize = i_size_read(inode);
> + if (offset >= isize) {
> + length = min(length, max_isize - offset);
> + if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
> + iomap->type = IOMAP_MAPPED;
> + else
> + iomap->type = IOMAP_UNWRITTEN;
> + } else {
> + length = min(length, isize - offset);
> + iomap->type = IOMAP_MAPPED;
> + }

Something was bugging me about this, and reading the rest of the
patch it finally triggered. For conventional zones, inode->i_size =
zi->i_max_size, and so if offset >= isize for a conventional
zone then this:

length = min(length, max_isize - offset);

is going to result in length <= 0 and we return a negative length
iomap.

IOWs, this case should only trigger for IO into sequential zones,
as it appears to be prevented at higher layers for conventional
zones by explicit checks against i_max_size and/or
iov_iter_truncate() calls to ensure user IOs are limited to within
i_max_size.

Hence it looks to me that triggering the (offset >= isize) case here
for conventional zones is a WARN_ON_ONCE() and return -EIO
situation...

SO, perhaps:

isize = i_size_read(inode);
if (offset >= isize) {
if (WARN_ON_ONCE(i->i_ztype == ZONEFS_ZTYPE_CNV)) {
/* drop locks */
return -EIO;
}
length = min(length, max_isize - offset);
iomap->type = IOMAP_UNWRITTEN;
} else {
length = min(length, isize - offset);
iomap->type = IOMAP_MAPPED;
}

This also seems tailored around the call from zonefs_map_blocks()
which tries to map the entire zone (length = zi->i_max_size) for
writeback mappings. Hence the length in this case always requires
clamping to zi->i_max_size - offset. Again, there's an issue here:

> +static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
> + struct inode *inode, loff_t offset)
> +{
> + if (offset >= wpc->iomap.offset &&
> + offset < wpc->iomap.offset + wpc->iomap.length)
> + return 0;
> +
> + memset(&wpc->iomap, 0, sizeof(wpc->iomap));
> + return zonefs_iomap_begin(inode, offset, ZONEFS_I(inode)->i_max_size,
> + 0, &wpc->iomap, NULL);

Where we pass flags = 0 into zonefs_iomap_begin(), and so there is
no checking that this writeback code path is only executing against
a conventional zone. I.e. the comments and checks in
zonefs_iomap_begin() relate only to user IO call paths, but don't
validate or comment on the writeback path callers, and there's no
comments or checks here that the inode points at a conventional
zone, either....

> +static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf)
> +{
> + struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file));
> + vm_fault_t ret;
> +
> + down_read(&zi->i_mmap_sem);
> + ret = filemap_fault(vmf);
> + up_read(&zi->i_mmap_sem);
> +
> + return ret;
> +}
> +
> +static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
> +{
> + struct inode *inode = file_inode(vmf->vma->vm_file);
> + struct zonefs_inode_info *zi = ZONEFS_I(inode);
> + vm_fault_t ret;
> +
> + sb_start_pagefault(inode->i_sb);
> + file_update_time(vmf->vma->vm_file);
> +
> + /* Serialize against truncates */
> + down_read(&zi->i_mmap_sem);
> + ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
> + up_read(&zi->i_mmap_sem);
> +
> + sb_end_pagefault(inode->i_sb);
> + return ret;
> +}

Should there be a WARN_ON_ONCE(zi->zi_type != ZONEFS_ZTYPE_CNV) in
here?

> +static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> + struct inode *inode = file_inode(iocb->ki_filp);
> + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
> + struct zonefs_inode_info *zi = ZONEFS_I(inode);
> + loff_t max_pos;
> + size_t count;
> + ssize_t ret;
> +
> + if (iocb->ki_pos >= zi->i_max_size)
> + return 0;
> +
> + if (iocb->ki_flags & IOCB_NOWAIT) {
> + if (!inode_trylock_shared(inode))
> + return -EAGAIN;
> + } else {
> + inode_lock_shared(inode);
> + }

We should really turn that into a generic helper. This pattern is
being replicated all over the place. Not in this patchset, though...

> +static int zonefs_report_zones_err_cb(struct blk_zone *zone, unsigned int idx,
> + void *data)
> +{
> + struct inode *inode = data;
> + struct zonefs_inode_info *zi = ZONEFS_I(inode);
> + loff_t pos;
> +
> + /*
> + * The condition of the zone may have change. Check it and adjust the
> + * inode information as needed, similarly to zonefs_init_file_inode().
> + */
> + if (zone->cond == BLK_ZONE_COND_OFFLINE) {
> + inode->i_flags |= S_IMMUTABLE;
> + inode->i_mode &= ~0777;
> + zone->wp = zone->start;
> + } else if (zone->cond == BLK_ZONE_COND_READONLY) {
> + inode->i_flags |= S_IMMUTABLE;
> + inode->i_mode &= ~0222;
> + }

This exact code is repeated in zonefs_init_file_inode(). Maybe it
should be a helper function?

> +
> + pos = (zone->wp - zone->start) << SECTOR_SHIFT;
> + zi->i_wpoffset = pos;
> + if (i_size_read(inode) != pos) {
> + zonefs_update_stats(inode, pos);
> + i_size_write(inode, pos);
> + }

What happens if this decreases the size of the zone? don't we need
to invalidate the page cache beyond the new EOF in this case (i.e.
it's a truncate operation)?

> +static int zonefs_seq_file_write_failed(struct inode *inode, int error)
> +{
> + struct zonefs_inode_info *zi = ZONEFS_I(inode);
> + struct super_block *sb = inode->i_sb;
> + sector_t sector = zi->i_zsector;
> + unsigned int nofs_flag;
> + int ret;
> +
> + zonefs_warn(sb, "Updating inode zone %llu info\n", sector);
> +
> + /*
> + * blkdev_report_zones() uses GFP_KERNEL by default. Force execution as
> + * if GFP_NOFS was specified so that it will not end up recursing into
> + * the FS on memory allocation.
> + */
> + nofs_flag = memalloc_nofs_save();
> + ret = blkdev_report_zones(sb->s_bdev, sector, 1,
> + zonefs_report_zones_err_cb, inode);
> + memalloc_nofs_restore(nofs_flag);

The comment is kinda redundant - it's explaining exactly what the
code does rather than why it needs this protection. i.e. the comment
should explain the recursion vector/deadlock that we are avoiding
here...

> +static int zonefs_file_dio_write_end(struct kiocb *iocb, ssize_t size, int ret,
> + unsigned int flags)
> +{
> + struct inode *inode = file_inode(iocb->ki_filp);
> + struct zonefs_inode_info *zi = ZONEFS_I(inode);
> +
> + if (ret)
> + return ret;
> +
> + /*
> + * Conventional zone file size is fixed to the zone size so there
> + * is no need to do anything.
> + */
> + if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
> + return 0;
> +
> + mutex_lock(&zi->i_truncate_mutex);
> +
> + if (size < 0) {
> + ret = zonefs_seq_file_write_failed(inode, size);

Ok, so I see it is being called from IO completion context, whcih
means we'd want memalloc_noio_save() because the underlying bio
doesn't get freed until this whole completion runs, right?

> +static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> + struct inode *inode = file_inode(iocb->ki_filp);
> + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
> + struct zonefs_inode_info *zi = ZONEFS_I(inode);
> + size_t count;
> + ssize_t ret;
> +
> + if (iocb->ki_flags & IOCB_NOWAIT) {
> + if (!inode_trylock(inode))
> + return -EAGAIN;
> + } else {
> + inode_lock(inode);
> + }
> +
> + ret = generic_write_checks(iocb, from);
> + if (ret <= 0)
> + goto out;
> +
> + iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos);
> + count = iov_iter_count(from);

So count is initialised to the entire IO length....

> +
> + /*
> + * Direct writes must be aligned to the block size, that is, the device
> + * physical sector size, to avoid errors when writing sequential zones
> + * on 512e devices (512B logical sector, 4KB physical sectors).
> + */
> + if ((iocb->ki_pos | count) & sbi->s_blocksize_mask) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + /*
> + * Enforce sequential writes (append only) in sequential zones.
> + */
> + mutex_lock(&zi->i_truncate_mutex);
> + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
> + iocb->ki_pos != zi->i_wpoffset) {
> + zonefs_err(inode->i_sb,
> + "Unaligned write at %llu + %zu (wp %llu)\n",
> + iocb->ki_pos, count,
> + zi->i_wpoffset);
> + mutex_unlock(&zi->i_truncate_mutex);
> + ret = -EINVAL;
> + goto out;
> + }
> + mutex_unlock(&zi->i_truncate_mutex);
> +
> + ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, &zonefs_dio_ops,
> + is_sync_kiocb(iocb));
> + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
> + (ret > 0 || ret == -EIOCBQUEUED)) {
> + if (ret > 0)
> + count = ret;
> + mutex_lock(&zi->i_truncate_mutex);
> + zi->i_wpoffset += count;
> + mutex_unlock(&zi->i_truncate_mutex);

Hmmmm. This looks problematic w.r.t. AIO. If we get -EIOCBQUEUED it
means the IO has been queued but not necessarily submitted, but
we update zi->i_wpoffset as though the entire AIO has laready
completed. ANd then we drop the inode_lock() and return, allowing
another AIO+DIO to be started.

Hence another concurrent sequential AIO+DIO write could now be
submitted and pass the above iocb->ki_pos != zi->i_wpoffset check.
Now we have two independent IOs in flight - one that is at the
current hardware write pointer offset, and one that is beyond it.

What happens if the block layer now re-orders these two IOs?


> +static struct dentry *zonefs_create_inode(struct dentry *parent,
> + const char *name, struct blk_zone *zone)
> +{
> + struct inode *dir = d_inode(parent);
> + struct dentry *dentry;
> + struct inode *inode;
> +
> + dentry = d_alloc_name(parent, name);
> + if (!dentry)
> + return NULL;
> +
> + inode = new_inode(parent->d_sb);
> + if (!inode)
> + goto out;
> +
> + inode->i_ino = get_next_ino();

get_next_ino() doesn't guarantee inode number uniqueness (it's 32
bit and global across all filesystems so it can overflow). Are
duplicate inode numbers on this superblock an issue?

> +/*
> + * Read super block information from the device.
> + */
> +static int zonefs_read_super(struct super_block *sb)
> +{
> + struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
> + struct zonefs_super *super;
> + u32 crc, stored_crc;
> + struct page *page;
> + struct bio_vec bio_vec;
> + struct bio bio;
> + int ret;
> +
> + page = alloc_page(GFP_KERNEL);
> + if (!page)
> + return -ENOMEM;
> +
> + bio_init(&bio, &bio_vec, 1);
> + bio.bi_iter.bi_sector = 0;
> + bio_set_dev(&bio, sb->s_bdev);
> + bio_set_op_attrs(&bio, REQ_OP_READ, 0);
> + bio_add_page(&bio, page, PAGE_SIZE, 0);
> +
> + ret = submit_bio_wait(&bio);
> + if (ret)
> + goto out;
> +
> + super = page_address(page);
> +
> + stored_crc = le32_to_cpu(super->s_crc);
> + super->s_crc = 0;
> + crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super));
> + if (crc != stored_crc) {
> + zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
> + crc, stored_crc);
> + ret = -EIO;
> + goto out;
> + }

Does this mean if mount or the kernel tries to autoprobe the
filesystem type on a device it will get -EIO and an "Invalid
checksum" error message rather than just silently returning -EINVAL
because....

> + ret = -EINVAL;
> + if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
> + goto out;

... it isn't actually a zonefs filesystem?

i.e. shouldn't these checks be the other way around?

Cheers,

Dave.
--
Dave Chinner
david@xxxxxxxxxxxxx