Re: [PATCH] tmpfs: add support for an i_version counter

From: Andrew Morton
Date: Fri Sep 09 2022 - 17:03:57 EST


On Fri, 9 Sep 2022 09:00:31 -0400 Jeff Layton <jlayton@xxxxxxxxxx> wrote:

> NFSv4 mandates a change attribute to avoid problems with timestamp
> granularity, which Linux implements using the i_version counter. This is
> particularly important when the underlying filesystem is fast.
>
> Give tmpfs an i_version counter. Since it doesn't have to be persistent,
> we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion
> calls in the right places.
>
> Also, while there is no formal spec for xattrs, most implementations
> update the ctime on setxattr. Fix shmem_xattr_handler_set to update the
> ctime and bump the i_version appropriately.
>
> ...
>
> --- a/fs/posix_acl.c
> +++ b/fs/posix_acl.c
> @@ -24,6 +24,7 @@
> #include <linux/user_namespace.h>
> #include <linux/namei.h>
> #include <linux/mnt_idmapping.h>
> +#include <linux/iversion.h>
>
> static struct posix_acl **acl_by_type(struct inode *inode, int type)
> {
> @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
> }
>
> inode->i_ctime = current_time(inode);
> + if (IS_I_VERSION(inode))
> + inode_inc_iversion(inode);
> set_cached_acl(inode, type, acl);
> return 0;
> }

adds a kilobyte of text to shmem.o because the quite large
inode_maybe_inc_iversion() get inlined all over the place. Why oh why.

Is there any reason not to do the obvious?

--- a/include/linux/iversion.h~a
+++ a/include/linux/iversion.h
@@ -177,56 +177,7 @@ inode_set_iversion_queried(struct inode
I_VERSION_QUERIED);
}

-/**
- * inode_maybe_inc_iversion - increments i_version
- * @inode: inode with the i_version that should be updated
- * @force: increment the counter even if it's not necessary?
- *
- * Every time the inode is modified, the i_version field must be seen to have
- * changed by any observer.
- *
- * If "force" is set or the QUERIED flag is set, then ensure that we increment
- * the value, and clear the queried flag.
- *
- * In the common case where neither is set, then we can return "false" without
- * updating i_version.
- *
- * If this function returns false, and no other metadata has changed, then we
- * can avoid logging the metadata.
- */
-static inline bool
-inode_maybe_inc_iversion(struct inode *inode, bool force)
-{
- u64 cur, old, new;
-
- /*
- * The i_version field is not strictly ordered with any other inode
- * information, but the legacy inode_inc_iversion code used a spinlock
- * to serialize increments.
- *
- * Here, we add full memory barriers to ensure that any de-facto
- * ordering with other info is preserved.
- *
- * This barrier pairs with the barrier in inode_query_iversion()
- */
- smp_mb();
- cur = inode_peek_iversion_raw(inode);
- for (;;) {
- /* If flag is clear then we needn't do anything */
- if (!force && !(cur & I_VERSION_QUERIED))
- return false;
-
- /* Since lowest bit is flag, add 2 to avoid it */
- new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
-
- old = atomic64_cmpxchg(&inode->i_version, cur, new);
- if (likely(old == cur))
- break;
- cur = old;
- }
- return true;
-}
-
+bool inode_maybe_inc_iversion(struct inode *inode, bool force);

/**
* inode_inc_iversion - forcibly increment i_version
--- a/fs/libfs.c~a
+++ a/fs/libfs.c
@@ -15,6 +15,7 @@
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
+#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
@@ -1529,3 +1530,53 @@ void generic_set_encrypted_ci_d_ops(stru
#endif
}
EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+
+/**
+ * inode_maybe_inc_iversion - increments i_version
+ * @inode: inode with the i_version that should be updated
+ * @force: increment the counter even if it's not necessary?
+ *
+ * Every time the inode is modified, the i_version field must be seen to have
+ * changed by any observer.
+ *
+ * If "force" is set or the QUERIED flag is set, then ensure that we increment
+ * the value, and clear the queried flag.
+ *
+ * In the common case where neither is set, then we can return "false" without
+ * updating i_version.
+ *
+ * If this function returns false, and no other metadata has changed, then we
+ * can avoid logging the metadata.
+ */
+bool inode_maybe_inc_iversion(struct inode *inode, bool force)
+{
+ u64 cur, old, new;
+
+ /*
+ * The i_version field is not strictly ordered with any other inode
+ * information, but the legacy inode_inc_iversion code used a spinlock
+ * to serialize increments.
+ *
+ * Here, we add full memory barriers to ensure that any de-facto
+ * ordering with other info is preserved.
+ *
+ * This barrier pairs with the barrier in inode_query_iversion()
+ */
+ smp_mb();
+ cur = inode_peek_iversion_raw(inode);
+ for (;;) {
+ /* If flag is clear then we needn't do anything */
+ if (!force && !(cur & I_VERSION_QUERIED))
+ return false;
+
+ /* Since lowest bit is flag, add 2 to avoid it */
+ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
+
+ old = atomic64_cmpxchg(&inode->i_version, cur, new);
+ if (likely(old == cur))
+ break;
+ cur = old;
+ }
+ return true;
+}
+EXPORT_SYMBOL(inode_maybe_inc_iversion);
_