Re: [PATCH] mm: Improve cmtime update on shared writable mmaps

From: Jan Kara
Date: Tue Nov 01 2011 - 18:53:53 EST


On Fri 28-10-11 16:39:25, Andy Lutomirski wrote:
> We used to update a file's time on do_wp_page. This caused latency
> whenever file_update_time would sleep (this happens on ext4). It is
> also, IMO, less than ideal: any copy, backup, or 'make' run taken
> after do_wp_page but before an mmap user finished writing would see
> the new timestamp but the old contents.
>
> With this patch, cmtime is updated after a page is written. When the
> mm code transfers the dirty bit from a pte to the associated struct
> page, it also sets a new page flag indicating that the page was
> modified directly from userspace. The inode's time is then updated in
> clear_page_dirty_for_io.
>
> We can't update cmtime in all contexts in which ptes are unmapped:
> various reclaim paths can unmap ptes from GFP_NOFS paths.
>
> Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> ---
>
> I'm not thrilled about using a page flag for this, but I haven't
> spotted a better way. Updating the time in writepage would touch
> a lot of files and would interact oddly with write.
I see two problems with this patch:
1) Using a page flags is really a no-go. We are rather short on page flags
so using them for such minor thing is a real wastage. Moreover it should be
rather easy to just use an inode flag instead.

2) You cannot call inode_update_time_writable() from
clear_page_dirty_for_io() because that is called under a page lock and thus
would create lock inversion problems.

Honza
>
> fs/inode.c | 51 ++++++++++++++++++++++++++++++-------------
> include/linux/fs.h | 1 +
> include/linux/page-flags.h | 5 ++++
> mm/page-writeback.c | 19 +++++++++++++---
> mm/rmap.c | 18 +++++++++++++-
> 5 files changed, 72 insertions(+), 22 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index ec79246..ee93a25 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1461,21 +1461,8 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
> }
> EXPORT_SYMBOL(touch_atime);
>
> -/**
> - * file_update_time - update mtime and ctime time
> - * @file: file accessed
> - *
> - * Update the mtime and ctime members of an inode and mark the inode
> - * for writeback. Note that this function is meant exclusively for
> - * usage in the file write path of filesystems, and filesystems may
> - * choose to explicitly ignore update via this function with the
> - * S_NOCMTIME inode flag, e.g. for network filesystem where these
> - * timestamps are handled by the server.
> - */
> -
> -void file_update_time(struct file *file)
> +static void do_inode_update_time(struct file *file, struct inode *inode)
> {
> - struct inode *inode = file->f_path.dentry->d_inode;
> struct timespec now;
> enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
>
> @@ -1497,7 +1484,7 @@ void file_update_time(struct file *file)
> return;
>
> /* Finally allowed to write? Takes lock. */
> - if (mnt_want_write_file(file))
> + if (file && mnt_want_write_file(file))
> return;
>
> /* Only change inode inside the lock region */
> @@ -1508,10 +1495,42 @@ void file_update_time(struct file *file)
> if (sync_it & S_MTIME)
> inode->i_mtime = now;
> mark_inode_dirty_sync(inode);
> - mnt_drop_write(file->f_path.mnt);
> +
> + if (file)
> + mnt_drop_write(file->f_path.mnt);
> +}
> +
> +/**
> + * file_update_time - update mtime and ctime time
> + * @file: file accessed
> + *
> + * Update the mtime and ctime members of an inode and mark the inode
> + * for writeback. Note that this function is meant exclusively for
> + * usage in the file write path of filesystems, and filesystems may
> + * choose to explicitly ignore update via this function with the
> + * S_NOCMTIME inode flag, e.g. for network filesystem where these
> + * timestamps are handled by the server.
> + */
> +
> +void file_update_time(struct file *file)
> +{
> + do_inode_update_time(file, file->f_path.dentry->d_inode);
> }
> EXPORT_SYMBOL(file_update_time);
>
> +/**
> + * inode_update_time_writable - update mtime and ctime
> + * @inode: inode accessed
> + *
> + * Same as file_update_time, except that the caller is responsible
> + * for checking that the mount is writable.
> + */
> +
> +void inode_update_time_writable(struct inode *inode)
> +{
> + do_inode_update_time(0, inode);
> +}
> +
> int inode_needs_sync(struct inode *inode)
> {
> if (IS_SYNC(inode))
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 277f497..9e28927 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2553,6 +2553,7 @@ extern int inode_newsize_ok(const struct inode *, loff_t offset);
> extern void setattr_copy(struct inode *inode, const struct iattr *attr);
>
> extern void file_update_time(struct file *file);
> +extern void inode_update_time_writable(struct inode *inode);
>
> extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt);
> extern void save_mount_options(struct super_block *sb, char *options);
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index e90a673..4eed012 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -107,6 +107,7 @@ enum pageflags {
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> PG_compound_lock,
> #endif
> + PG_update_cmtime, /* Dirtied via writable mapping. */
> __NR_PAGEFLAGS,
>
> /* Filesystems */
> @@ -273,6 +274,10 @@ PAGEFLAG_FALSE(HWPoison)
> #define __PG_HWPOISON 0
> #endif
>
> +/* Whoever clears PG_update_cmtime must update the cmtime. */
> +SETPAGEFLAG(UpdateCMTime, update_cmtime)
> +TESTCLEARFLAG(UpdateCMTime, update_cmtime)
> +
> u64 stable_page_flags(struct page *page);
>
> static inline int PageUptodate(struct page *page)
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 0e309cd..41c48ea 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -1460,7 +1460,8 @@ EXPORT_SYMBOL(set_page_dirty_lock);
>
> /*
> * Clear a page's dirty flag, while caring for dirty memory accounting.
> - * Returns true if the page was previously dirty.
> + * Returns true if the page was previously dirty. Also updates inode time
> + * if necessary.
> *
> * This is for preparing to put the page under writeout. We leave the page
> * tagged as dirty in the radix tree so that a concurrent write-for-sync
> @@ -1474,6 +1475,7 @@ EXPORT_SYMBOL(set_page_dirty_lock);
> */
> int clear_page_dirty_for_io(struct page *page)
> {
> + int ret;
> struct address_space *mapping = page_mapping(page);
>
> BUG_ON(!PageLocked(page));
> @@ -1520,11 +1522,20 @@ int clear_page_dirty_for_io(struct page *page)
> dec_zone_page_state(page, NR_FILE_DIRTY);
> dec_bdi_stat(mapping->backing_dev_info,
> BDI_RECLAIMABLE);
> - return 1;
> + ret = 1;
> + goto out;
> }
> - return 0;
> + ret = 0;
> + goto out;
> }
> - return TestClearPageDirty(page);
> + ret = TestClearPageDirty(page);
> +
> +out:
> + /* We know that the inode (if any) is on a writable mount. */
> + if (mapping && mapping->host && TestClearPageUpdateCMTime(page))
> + inode_update_time_writable(mapping->host);
> +
> + return ret;
> }
> EXPORT_SYMBOL(clear_page_dirty_for_io);
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 8005080..2ee595d 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -937,6 +937,16 @@ int page_mkclean(struct page *page)
> struct address_space *mapping = page_mapping(page);
> if (mapping) {
> ret = page_mkclean_file(mapping, page);
> +
> + /*
> + * If dirtied via shared writable mapping, cmtime
> + * needs to be updated. If dirtied only through
> + * write(), etc, then the writer already updated
> + * cmtime.
> + */
> + if (ret)
> + SetPageUpdateCMTime(page);
> +
> if (page_test_and_clear_dirty(page_to_pfn(page), 1))
> ret = 1;
> }
> @@ -1203,8 +1213,10 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> pteval = ptep_clear_flush_notify(vma, address, pte);
>
> /* Move the dirty bit to the physical page now the pte is gone. */
> - if (pte_dirty(pteval))
> + if (pte_dirty(pteval)) {
> + SetPageUpdateCMTime(page);
> set_page_dirty(page);
> + }
>
> /* Update high watermark before we lower rss */
> update_hiwater_rss(mm);
> @@ -1388,8 +1400,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
> set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
>
> /* Move the dirty bit to the physical page now the pte is gone. */
> - if (pte_dirty(pteval))
> + if (pte_dirty(pteval)) {
> + SetPageUpdateCMTime(page);
> set_page_dirty(page);
> + }
>
> page_remove_rmap(page);
> page_cache_release(page);
> --
> 1.7.6.4
>
--
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/