Re: [PATCH v2] ceph: invalidate pages when doing DIO in encrypted inodes

From: Luís Henriques
Date: Thu Apr 07 2022 - 05:06:21 EST


Xiubo Li <xiubli@xxxxxxxxxx> writes:

> Hi Luis,
>
> Please try the following patch, to see could it resolve your issue:

No, this seems to deadlock when running my test. I'm attaching the code
I'm using to test it; it's part of generic/647 but I've removed all the
other test cases that were passing. I simply mount the filesystem with
test_dummy_encryption and run this test. With your patch it'll hang;
without it it'll show "pwrite (O_DIRECT) is broken". The extra
invalidate_inode_pages2_range() will make it pass.

Cheers,
--
Luís

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2021 Red Hat, Inc. All Rights Reserved.
* Written by Andreas Gruenbacher (agruenba@xxxxxxxxxx)
*/

/* Trigger page faults in the same file during read and write. */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* to get definition of O_DIRECT flag. */
#endif

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <err.h>

char *filename;
unsigned int page_size;
void *page;
char *addr;
int fd;
ssize_t ret;

/*
* Leave a hole at the beginning of the test file and initialize a block of
* @page_size bytes at offset @page_size to @c. Then, reopen the file and
* mmap the first two pages.
*/
void init(char c, int flags)
{
fd = open(filename, O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, 0666);
if (fd == -1)
goto fail;
memset(page, c, page_size);
ret = pwrite(fd, page, page_size, page_size);
if (ret != page_size)
goto fail;
if (close(fd))
goto fail;

fd = open(filename, flags);
if (fd == -1)
goto fail;
addr = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (addr == MAP_FAILED)
err(1, NULL);
return;

fail:
err(1, "%s", filename);
}

static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
ssize_t count2 = 0, ret;

do {
ret = pwrite(fd, buf, count, offset);
if (ret == -1) {
if (errno == EINTR)
continue;
break;
}
if (ret == 0)
break;
count2 += ret;
buf += ret;
count -= ret;
} while (count);
return count2;
}

int main(int argc, char *argv[])
{
if (argc != 2)
errx(1, "no test filename argument given");
filename = argv[1];

page_size = ret = sysconf(_SC_PAGE_SIZE);
if (ret == -1)
err(1, NULL);

ret = posix_memalign(&page, page_size, page_size);
if (ret) {
errno = ENOMEM;
err(1, NULL);
}
init('d', O_RDWR | O_DIRECT);
ret = do_write(fd, addr + page_size, page_size, 0);
if (ret != page_size)
err(1, "pwrite %s (O_DIRECT): %ld != %u", filename, ret, page_size);
if (memcmp(addr, page, page_size))
errx(1, "pwrite (O_DIRECT) is broken");

if (fsync(fd))
errx(1, "fsync");
if (close(fd))
errx(1, "close");

if (unlink(filename))
err(1, "unlink %s", filename);

return 0;
}

>
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 5d39d8e54273..3507e4066de4 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -2011,6 +2011,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct
> iov_iter *to)
>                      ceph_cap_string(got));
>
>                 if (ci->i_inline_version == CEPH_INLINE_NONE) {
> + filemap_invalidate_lock(inode->i_mapping);
>                         if (!retry_op &&
>                             (iocb->ki_flags & IOCB_DIRECT) &&
>                             !IS_ENCRYPTED(inode)) {
> @@ -2021,6 +2022,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct
> iov_iter *to)
>                         } else {
>                                 ret = ceph_sync_read(iocb, to, &retry_op);
>                         }
> + filemap_invalidate_unlock(inode->i_mapping);
>                 } else {
>                         retry_op = READ_INLINE;
>                 }
> @@ -2239,11 +2241,13 @@ static ssize_t ceph_write_iter(struct kiocb *iocb,
> struct iov_iter *from)
>
>                 /* we might need to revert back to that point */
>                 data = *from;
> +               filemap_invalidate_lock(inode->i_mapping);
>                 if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode))
>                         written = ceph_direct_read_write(iocb, &data, snapc,
> &prealloc_cf);
>                 else
>                         written = ceph_sync_write(iocb, &data, pos, snapc);
> +               filemap_invalidate_unlock(inode->i_mapping);
>                 if (direct_lock)
>                         ceph_end_io_direct(inode);
>                 else
>
>
>
> On 4/1/22 9:32 PM, Luís Henriques wrote:
>> When doing DIO on an encrypted node, we need to invalidate the page cache in
>> the range being written to, otherwise the cache will include invalid data.
>>
>> Signed-off-by: Luís Henriques <lhenriques@xxxxxxx>
>> ---
>> fs/ceph/file.c | 11 ++++++++++-
>> 1 file changed, 10 insertions(+), 1 deletion(-)
>>
>> Changes since v1:
>> - Replaced truncate_inode_pages_range() by invalidate_inode_pages2_range
>> - Call fscache_invalidate with FSCACHE_INVAL_DIO_WRITE if we're doing DIO
>>
>> Note: I'm not really sure this last change is required, it doesn't really
>> affect generic/647 result, but seems to be the most correct.
>>
>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> index 5072570c2203..b2743c342305 100644
>> --- a/fs/ceph/file.c
>> +++ b/fs/ceph/file.c
>> @@ -1605,7 +1605,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
>> if (ret < 0)
>> return ret;
>> - ceph_fscache_invalidate(inode, false);
>> + ceph_fscache_invalidate(inode, (iocb->ki_flags & IOCB_DIRECT));
>> ret = invalidate_inode_pages2_range(inode->i_mapping,
>> pos >> PAGE_SHIFT,
>> (pos + count - 1) >> PAGE_SHIFT);
>> @@ -1895,6 +1895,15 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
>> req->r_inode = inode;
>> req->r_mtime = mtime;
>> + if (IS_ENCRYPTED(inode) && (iocb->ki_flags & IOCB_DIRECT)) {
>> + ret = invalidate_inode_pages2_range(
>> + inode->i_mapping,
>> + write_pos >> PAGE_SHIFT,
>> + (write_pos + write_len - 1) >> PAGE_SHIFT);
>> + if (ret < 0)
>> + dout("invalidate_inode_pages2_range returned %d\n", ret);
>> + }
>> +
>> /* Set up the assertion */
>> if (rmw) {
>> /*
>>
>