Re: [PATCH V5 06/12] fs: Add locking for a dynamic address space operations state

From: Dave Chinner
Date: Sun Mar 01 2020 - 20:36:12 EST


On Mon, Mar 02, 2020 at 12:26:44PM +1100, Dave Chinner wrote:
> > /*
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 1784478270e1..3a7863ba51b9 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -2293,6 +2293,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
> > * and return. Otherwise fallthrough to buffered io for
> > * the rest of the read. Buffered reads will not work for
> > * DAX files, so don't bother trying.
> > + *
> > + * IS_DAX is protected under ->read_iter lock
> > */
> > if (retval < 0 || !count || iocb->ki_pos >= size ||
> > IS_DAX(inode))
>
> This check is in the DIO path, but we can't do DIO on DAX-enabled
> files to begin with, so we can only get here if S_DAX is not set on
> the file.
>
> Further, if IOCB_DIRECT is set, neither ext4 nor XFS calls
> generic_file_read_iter(); they run the iomap_dio_rw() path directly
> instead. Only ext2 calls generic_file_read_iter() to do direct IO,
> so it's the only filesystem that needs this IS_DAX() check in it.
>
> I think we should fix ext2 to be implemented like ext4 and XFS -
> they implement the buffered IO fallback themselves, should it be
> required, and never use generic_file_read_iter() for direct IO.
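>
> Something like this in ext2, say - ext2_dio_read_iter() is a
> hypothetical helper here, named only to show the shape ext4 and
> XFS already use:
>
>         static ssize_t
>         ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>         {
>                 if (iocb->ki_flags & IOCB_DIRECT)
>                         return ext2_dio_read_iter(iocb, to);
>                 return generic_file_read_iter(iocb, to);
>         }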
>
> That would allow us to add this to generic_file_read_iter():
>
>         if (WARN_ON_ONCE(IS_DAX(inode)))
>                 return -EINVAL;
>
> to indicate that this should never be called directly on a DAX
> capable filesystem. This places all the responsibility for managing
> DAX behaviour on the filesystem, which then allows us to reason more
> solidly about how the filesystem IO paths use and check the S_DAX
> flag.
>
> i.e. changing the on-disk flag already locks out the filesystem IO
> paths via the i_rwsem, and all the filesystem IO paths (buffered,
> direct IO and DAX) are serialised by that lock. Hence we can check
> the flag once in the filesystem path, after we have taken the
> i_rwsem, and know that S_DAX will not change until we release it.
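>
> To make that concrete, here's a minimal sketch of the flag-changing
> side - fs_switch_dax() is hypothetical, not anything in this series;
> the point is only that it takes the same i_rwsem the IO paths hold:
>
>         static int fs_switch_dax(struct inode *inode, bool enable)
>         {
>                 int error;
>
>                 /* excludes all IO paths that hold the i_rwsem */
>                 inode_lock(inode);
>                 error = filemap_write_and_wait(inode->i_mapping);
>                 if (!error) {
>                         /*
>                          * A real implementation would also need to
>                          * invalidate the page cache here before
>                          * flipping the flag.
>                          */
>                         if (enable)
>                                 inode->i_flags |= S_DAX;
>                         else
>                                 inode->i_flags &= ~S_DAX;
>                 }
>                 inode_unlock(inode);
>                 return error;
>         }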
>
> ..... and now I realise what I was sitting on the fence about....
>
> I don't like the aops locking in call_read/write_iter() because it
> is actually redundant: the filesystem should be doing the necessary
> locking in the IO path via the i_rwsem to prevent S_DAX from
> changing while it is doing the IO.
>
> IOWs, we need to restructure the locking inside the filesystem
> read_iter and write_iter methods so that the i_rwsem protects the
> S_DAX flag from changing dynamically. They all do:
>
>         if (dax)
>                 do_dax_io()
>         if (direct)
>                 do_direct_io()
>         do_buffered_io()
>
> And then we take the i_rwsem inside each of those functions and do
> the IO. What we actually need to do is something like this:
>
>         inode_lock_shared()
>         if (dax)
>                 do_dax_io()
>         if (direct)
>                 do_direct_io()
>         do_buffered_io()
>         inode_unlock_shared()
>
> And remove the inode locking from inside the individual IO methods
> themselves. It's a bit more complex than this because buffered
> writes require exclusive locking, but this completely removes the
> need for holding an aops lock over these methods.
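>
> To give a flavour of the write side complication (sketch only; the
> real version is in the second patch below), the lock type now has
> to be chosen before we take the lock, because it depends on the IO
> type:
>
>         /*
>          * Buffered and DAX writes need the iolock exclusive;
>          * aligned direct IO can take it shared. (Unaligned direct
>          * IO also needs it exclusive - elided here.)
>          */
>         if (IS_DAX(inode) || !(iocb->ki_flags & IOCB_DIRECT))
>                 iolock = XFS_IOLOCK_EXCL;
>         else
>                 iolock = XFS_IOLOCK_SHARED;
>         xfs_ilock(ip, iolock);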
>
> I've attached a couple of untested patches (I've compiled them, so
> they must be good!) to demonstrate what I mean for the XFS IO path.
> The read side removes a heap of duplicate code, but the write side
> is .... unfortunately complex. Have to think about that more.

And now with patches...

Cheers,

Dave.
--
Dave Chinner
david@xxxxxxxxxxxxx

xfs: pull read IO path locking up to protect S_DAX checks

From: Dave Chinner <dchinner@xxxxxxxxxx>

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
fs/xfs/xfs_file.c | 77 +++++++++++++++++--------------------------------------
1 file changed, 24 insertions(+), 53 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b8a4a3f29b36..7d31a0e76b68 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -176,28 +176,12 @@ xfs_file_dio_aio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- size_t count = iov_iter_count(to);
- ssize_t ret;
-
- trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
-
- if (!count)
- return 0; /* skip atime */
+ trace_xfs_file_direct_read(XFS_I(file_inode(iocb->ki_filp)),
+ iov_iter_count(to), iocb->ki_pos);

file_accessed(iocb->ki_filp);
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
+ return iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
is_sync_kiocb(iocb));
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- return ret;
}

static noinline ssize_t
@@ -205,27 +189,12 @@ xfs_file_dax_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
- size_t count = iov_iter_count(to);
- ssize_t ret = 0;
-
- trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
-
- if (!count)
- return 0; /* skip atime */
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
- ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ trace_xfs_file_dax_read(XFS_I(file_inode(iocb->ki_filp)),
+ iov_iter_count(to), iocb->ki_pos);

file_accessed(iocb->ki_filp);
- return ret;
+ return dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
}

STATIC ssize_t
@@ -233,21 +202,10 @@ xfs_file_buffered_aio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- ssize_t ret;
-
- trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+ trace_xfs_file_buffered_read(XFS_I(file_inode(iocb->ki_filp)),
+ iov_iter_count(to), iocb->ki_pos);

- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
- ret = generic_file_read_iter(iocb, to);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- return ret;
+ return generic_file_read_iter(iocb, to);
}

STATIC ssize_t
@@ -256,7 +214,8 @@ xfs_file_read_iter(
struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
ssize_t ret = 0;

XFS_STATS_INC(mp, xs_read_calls);
@@ -264,6 +223,16 @@ xfs_file_read_iter(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;

+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
+
if (IS_DAX(inode))
ret = xfs_file_dax_read(iocb, to);
else if (iocb->ki_flags & IOCB_DIRECT)
@@ -271,6 +240,8 @@ xfs_file_read_iter(
else
ret = xfs_file_buffered_aio_read(iocb, to);

+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
return ret;

xfs: pull write IO locking up to protect S_DAX

From: Dave Chinner <dchinner@xxxxxxxxxx>

Much more complex than the read side because of all the lock
juggling we do for the different types of writes and all the various
metadata updates a write might need prior to dispatching the data
IO.

Not really happy about the way the high level logic turned out;
could probably be made cleaner and smarter with a bit more thought.

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
fs/xfs/xfs_file.c | 205 ++++++++++++++++++++++++++++++------------------------
1 file changed, 115 insertions(+), 90 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7d31a0e76b68..b7c2bb09b945 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -250,9 +250,9 @@ xfs_file_read_iter(
/*
* Common pre-write limit and setup checks.
*
- * Called with the iolocked held either shared and exclusive according to
- * @iolock, and returns with it held. Might upgrade the iolock to exclusive
- * if called for a direct write beyond i_size.
+ * Called without the iolock held, but will lock it according to @iolock, and
+ * returns with it held on both success and error. Might upgrade the iolock to
+ * exclusive if called for a direct write beyond i_size.
*/
STATIC ssize_t
xfs_file_aio_write_checks(
@@ -268,6 +268,13 @@ xfs_file_aio_write_checks(
bool drained_dio = false;
loff_t isize;

+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, *iolock))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, *iolock);
+ }
+
restart:
error = generic_write_checks(iocb, from);
if (error <= 0)
@@ -325,7 +332,7 @@ xfs_file_aio_write_checks(
drained_dio = true;
goto restart;
}
-
+
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
NULL, &xfs_buffered_write_iomap_ops);
@@ -449,19 +456,14 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
* Returns with locks held indicated by @iolock and errors indicated by
* negative return values.
*/
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static int
+xfs_file_dio_write_lock_mode(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
size_t count = iov_iter_count(from);
struct xfs_buftarg *target = xfs_inode_buftarg(ip);

@@ -478,35 +480,37 @@ xfs_file_dio_aio_write(
*/
if ((iocb->ki_pos & mp->m_blockmask) ||
((iocb->ki_pos + count) & mp->m_blockmask)) {
- unaligned_io = 1;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* unaligned dio always waits, bail */
+ return -EAGAIN;
+ }

/*
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
if (xfs_is_cow_inode(ip)) {
- trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+ trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos,
+ count);
return -EREMCHG;
}
- iolock = XFS_IOLOCK_EXCL;
- } else {
- iolock = XFS_IOLOCK_SHARED;
- }
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* unaligned dio always waits, bail */
- if (unaligned_io)
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, iolock))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
+ return XFS_IOLOCK_EXCL;
}
+ return XFS_IOLOCK_SHARED;
+}

- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
- if (ret)
- goto out;
- count = iov_iter_count(from);
+STATIC ssize_t
+xfs_file_dio_aio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ int iolock)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ bool unaligned_io = false;
+ size_t count = iov_iter_count(from);

/*
* If we are doing unaligned IO, we can't allow any other overlapping IO
@@ -515,7 +519,9 @@ xfs_file_dio_aio_write(
* iolock if we had to take the exclusive lock in
* xfs_file_aio_write_checks() for other reasons.
*/
- if (unaligned_io) {
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + count) & mp->m_blockmask)) {
+ unaligned_io = true;
inode_dio_wait(inode);
} else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
@@ -530,7 +536,6 @@ xfs_file_dio_aio_write(
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
&xfs_dio_write_ops,
is_sync_kiocb(iocb) || unaligned_io);
-out:
xfs_iunlock(ip, iolock);

/*
@@ -544,26 +549,15 @@ xfs_file_dio_aio_write(
static noinline ssize_t
xfs_file_dax_write(
struct kiocb *iocb,
- struct iov_iter *from)
+ struct iov_iter *from,
+ int iolock)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
- int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
size_t count;
loff_t pos;

- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, iolock))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
- }
-
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
- if (ret)
- goto out;
-
pos = iocb->ki_pos;
count = iov_iter_count(from);

@@ -573,7 +567,6 @@ xfs_file_dax_write(
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
-out:
xfs_iunlock(ip, iolock);
if (error)
return error;
@@ -590,26 +583,13 @@ xfs_file_dax_write(
STATIC ssize_t
xfs_file_buffered_aio_write(
struct kiocb *iocb,
- struct iov_iter *from)
+ struct iov_iter *from,
+ int iolock,
+ bool flush_on_enospc)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int enospc = 0;
- int iolock;
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
-
-write_retry:
- iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
-
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
- if (ret)
- goto out;

/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
@@ -617,8 +597,6 @@ xfs_file_buffered_aio_write(
trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops);
+ /* clear before the early returns in the retry logic below */
+ current->backing_dev_info = NULL;
- if (likely(ret >= 0))
- iocb->ki_pos += ret;

/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -629,34 +607,35 @@ xfs_file_buffered_aio_write(
* also behaves as a filter to prevent too many eofblocks scans from
* running at the same time.
*/
- if (ret == -EDQUOT && !enospc) {
+ if (ret == -EDQUOT && flush_on_enospc) {
+ int made_progress;
+
xfs_iunlock(ip, iolock);
- enospc = xfs_inode_free_quota_eofblocks(ip);
- if (enospc)
- goto write_retry;
- enospc = xfs_inode_free_quota_cowblocks(ip);
- if (enospc)
- goto write_retry;
- iolock = 0;
- } else if (ret == -ENOSPC && !enospc) {
+ made_progress = xfs_inode_free_quota_eofblocks(ip);
+ if (made_progress)
+ return -EAGAIN;
+ made_progress = xfs_inode_free_quota_cowblocks(ip);
+ if (made_progress)
+ return -EAGAIN;
+ return ret;
+ }
+ if (ret == -ENOSPC && flush_on_enospc) {
struct xfs_eofblocks eofb = {0};

- enospc = 1;
xfs_flush_inodes(ip->i_mount);

xfs_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
xfs_icache_free_eofblocks(ip->i_mount, &eofb);
xfs_icache_free_cowblocks(ip->i_mount, &eofb);
- goto write_retry;
+ return -EAGAIN;
}

current->backing_dev_info = NULL;
-out:
- if (iolock)
- xfs_iunlock(ip, iolock);
-
+ xfs_iunlock(ip, iolock);
if (ret > 0) {
+ iocb->ki_pos += ret;
+
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
ret = generic_write_sync(iocb, ret);
@@ -675,6 +654,8 @@ xfs_file_write_iter(
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
size_t ocount = iov_iter_count(from);
+ int iolock;
+ bool flush_on_enospc = true;

XFS_STATS_INC(ip->i_mount, xs_write_calls);

@@ -684,22 +665,66 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;

- if (IS_DAX(inode))
- return xfs_file_dax_write(iocb, from);
-
- if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * Set up the initial locking state - only direct IO uses shared locking
+ * for writes, and buffered IO does not support IOCB_NOWAIT.
+ */
+relock:
+ if (IS_DAX(inode)) {
+ iolock = XFS_IOLOCK_EXCL;
+ } else if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
* write *only* in the case that we're doing a reflink
* CoW. In all other directio scenarios we do not
* allow an operation to fall back to buffered mode.
*/
- ret = xfs_file_dio_aio_write(iocb, from);
- if (ret != -EREMCHG)
- return ret;
+ iolock = xfs_file_dio_write_lock_mode(iocb, from);
+ if (iolock == -EREMCHG) {
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ iolock = XFS_IOLOCK_EXCL;
+ }
+ if (iolock < 0)
+ return iolock;
+ } else if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* buffered writes do not support IOCB_NOWAIT */
+ return -EOPNOTSUPP;
+ } else {
+ iolock = XFS_IOLOCK_EXCL;
}

- return xfs_file_buffered_aio_write(iocb, from);
+ ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ if (ret) {
+ xfs_iunlock(ip, iolock);
+ return ret;
+ }
+
+ /*
+ * If the IO mode switched while we were locking, meaning we now
+ * hold the wrong lock type, start again.
+ */
+ if ((IS_DAX(inode) || !(iocb->ki_flags & IOCB_DIRECT)) &&
+ iolock != XFS_IOLOCK_EXCL) {
+ xfs_iunlock(ip, iolock);
+ goto relock;
+ }
+
+ if (IS_DAX(inode)) {
+ ret = xfs_file_dax_write(iocb, from, iolock);
+ } else if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = xfs_file_dio_aio_write(iocb, from, iolock);
+ ASSERT(ret != -EREMCHG);
+ } else {
+ ret = xfs_file_buffered_aio_write(iocb, from, iolock,
+ flush_on_enospc);
+ if (ret == -EAGAIN && flush_on_enospc) {
+ /* inode already unlocked, but need another attempt */
+ flush_on_enospc = false;
+ goto relock;
+ }
+ ASSERT(ret != -EAGAIN);
+ }
+ return ret;
}

static void