[PATCH] btrfs: Fix memory ordering of unlocked dio reads vs truncate

From: Nikolay Borisov
Date: Wed Mar 07 2018 - 10:19:12 EST


Signed-off-by: Nikolay Borisov <nborisov@xxxxxxxx>
---
fs/btrfs/btrfs_inode.h | 17 -----------------
fs/btrfs/inode.c | 41 ++++++++++++++++++++++++++++++++++-------
2 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4e12a477d32e..e84f58cca02e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -317,23 +317,6 @@ struct btrfs_dio_private {
blk_status_t);
};

-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6d2bb58d277a..d64600268c3a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4626,10 +4626,29 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)

truncate_setsize(inode, newsize);

- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
+ /*
+ * This code is very subtle. It is essentially a lock of its
+ * own type. BTRFS allows multiple DIO readers to race with
+ * writers so long as they don't read beyond EOF of an inode.
+ * However, if we have a pending truncate we'd like to signal
+ * DIO readers they should fall back to DIO_LOCKING semantics.
+ * This ensures that multiple aggressive DIO readers cannot
+ * starve the truncating thread.
+ *
+ * This semantics is achieved by the use of the below flag. If
+ * new readers come after the flag has been cleared then the
+ * state is still consistent, since the RELEASE semantics of
+ * clear_bit_unlock ensure the truncate inode size will be
+ * visible and DIO readers will bail out.
+ *
+ * The implied memory barrier by inode_dio_wait is paired with
+ * smp_mb__before_atomic in btrfs_direct_IO.
+ */
+ set_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags);
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
+ clear_bit_unlock(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags);

ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -8070,11 +8089,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ } else {
+ /*
+ * This barrier is paired with the implied barrier in
+ * inode_dio_wait. It ensures that READDIO_NEED_LOCK is
+ * visible if we have a pending truncate.
+ */
+ smp_mb__before_atomic();
+ if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
+ inode_dio_end(inode);
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ wakeup = false;
+ }
}

ret = __blockdev_direct_IO(iocb, inode,
--
2.17.1


--------------961A561BD5F9D1A717978A8F--