[RFC PATCH] vfs: rename: link replacement to source

From: Miklos Szeredi
Date: Fri Oct 25 2013 - 05:16:44 EST


I'm really reluctant to implement whiteouts in the filesystem, since its
basically a single-use code, and it would come with introducing a new
object type, adding support for this object to each operation, adding
special support to userspace API, etc... Instead we can really reuse
existing features of filesystems to implement whiteouts, with all the
kernel and userspace interface issues already sorted out
(e.g. symlink+xattr, as used by old and well used overlayfs code).

So the only operation remaining to allow atomic whiteouts is renaming to an
existing destination and whiteouting the source.

This patch adds an extension to renameat2 (which should now really be
renameat3), that allows this. It's basically an atomic equivalent to:

rename source dest
link replacement source

Unlike plain rename, this is always balanced (source and dest exist and
remain to exist after the operation). And it only involves locking two
directories. The only tricky part is having to lock both "dest" and
"replacement" inodes.

The syscall interface is now

int renameat2(srcdfd, srcname, dstdfd, dstname, replacefd, flags);

where replacefd can be any non-directory, possibly opened with O_PATH.

Not sure if we really need a dfd+name for the replacement argument, but
that's a possibility as well.

Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxx>
---
This is on top of the cross-rename patches posted previously. Git tree here:

git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git cross-rename

fs/ext4/ext4.h | 26 ++++++----
fs/ext4/namei.c | 46 +++++++++++++++---
fs/namei.c | 121 +++++++++++++++++++++++++++++++++++++++++++-----
include/linux/fs.h | 7 +-
include/uapi/linux/fs.h | 1
5 files changed, 171 insertions(+), 30 deletions(-)

--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3142,6 +3142,7 @@ static void ext4_rename_delete(handle_t
*/
static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
+ struct dentry *replace_src, struct dentry *replace_dst,
unsigned int flags)
{
handle_t *handle = NULL;
@@ -3155,6 +3156,23 @@ static int ext4_rename2(struct inode *ol
};
int retval;
bool overwrite = !(flags & RENAME_EXCHANGE);
+ struct inode *replace = NULL;
+ int nblocks = 2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2;
+
+ if (flags & ~(RENAME_EXCHANGE | RENAME_REPLACE))
+ return -EOPNOTSUPP;
+
+ if (flags & RENAME_REPLACE) {
+ replace = replace_src->d_inode;
+ if (replace->i_nlink >= EXT4_LINK_MAX)
+ return -EMLINK;
+
+ nblocks += EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1;
+ }
+ if (!overwrite)
+ nblocks += EXT4_INDEX_EXTRA_TRANS_BLOCKS;

dquot_initialize(old.dir);
dquot_initialize(new.dir);
@@ -3193,9 +3211,7 @@ static int ext4_rename2(struct inode *ol
goto end_rename;
}

- handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
- 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR, nblocks);
if (IS_ERR(handle))
return PTR_ERR(handle);

@@ -3233,7 +3249,7 @@ static int ext4_rename2(struct inode *ol
if (retval)
goto end_rename;
} else {
- u8 new_file_type = new.de->file_type;
+ u8 file_type = new.de->file_type;
retval = ext4_setent(handle, &new,
old.inode->i_ino, old.de->file_type);
if (retval)
@@ -3241,10 +3257,28 @@ static int ext4_rename2(struct inode *ol

if (!overwrite) {
retval = ext4_setent(handle, &old,
- new.inode->i_ino, new_file_type);
+ new.inode->i_ino, file_type);
if (retval)
goto end_rename;
}
+ if (replace) {
+ file_type = ext4_type_by_mode(replace->i_mode);
+ replace->i_ctime = ext4_current_time(replace);
+ ext4_inc_count(handle, replace);
+ ihold(replace);
+
+ retval = ext4_setent(handle, &old,
+ replace->i_ino, file_type);
+ if (retval) {
+ drop_nlink(replace);
+ iput(replace);
+ goto end_rename;
+ }
+ ext4_mark_inode_dirty(handle, replace);
+ if (replace->i_nlink == 1)
+ ext4_orphan_del(handle, replace);
+ d_instantiate(replace_dst, replace);
+ }
}

/*
@@ -3254,7 +3288,7 @@ static int ext4_rename2(struct inode *ol
old.inode->i_ctime = ext4_current_time(old.inode);
ext4_mark_inode_dirty(handle, old.inode);

- if (overwrite) {
+ if (overwrite && !replace) {
/*
* ok, that's it
*/
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3937,6 +3937,17 @@ SYSCALL_DEFINE2(link, const char __user
return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}

+static void rename_lock_for_replace(struct inode *target, struct inode *replace)
+{
+ if (S_ISDIR(target->i_mode) || target < replace) {
+ mutex_lock(&target->i_mutex);
+ mutex_lock_nested(&replace->i_mutex, I_MUTEX_REPLACE);
+ } else {
+ mutex_lock(&replace->i_mutex);
+ mutex_lock_nested(&target->i_mutex, I_MUTEX_REPLACE);
+ }
+}
+
/*
* The worst of all namespace operations - renaming directory. "Perverted"
* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
@@ -3966,7 +3977,7 @@ SYSCALL_DEFINE2(link, const char __user
*/
static int vfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ struct dentry *replace_src, unsigned int flags)
{
int error;
const unsigned char *old_name;
@@ -3976,10 +3987,22 @@ static int vfs_rename2(struct inode *old
bool new_is_dir = target ? S_ISDIR(target->i_mode) : false;
bool overwrite = !(flags & RENAME_EXCHANGE);
unsigned max_links = new_dir->i_sb->s_max_links;
+ struct dentry *replace_dst = NULL;
+ struct inode *replace = NULL;

if (source == target)
return 0;

+ if (flags & RENAME_REPLACE) {
+ if (!overwrite || !target)
+ return -EINVAL;
+ replace = replace_src->d_inode;
+ if (!replace)
+ return -ENOENT;
+ if (target == replace)
+ return -EINVAL;
+ }
+
error = may_delete(old_dir, old_dentry, is_dir);
if (error)
return error;
@@ -4027,10 +4050,37 @@ static int vfs_rename2(struct inode *old
return error;
}

+ if (flags & RENAME_REPLACE) {
+ if (old_dir->i_sb != replace->i_sb)
+ return -EXDEV;
+ if (IS_APPEND(replace) || IS_IMMUTABLE(replace))
+ return -EPERM;
+ if (S_ISDIR(replace->i_mode))
+ return -EPERM;
+ error = security_inode_link(replace_src, old_dir,
+ old_dentry);
+ if (error)
+ return error;
+
+ replace_dst = d_alloc(old_dentry->d_parent,
+ &old_dentry->d_name);
+ if (!replace_dst)
+ return -ENOMEM;
+ }
+
old_name = fsnotify_oldname_init(old_dentry->d_name.name);
dget(new_dentry);
- if (overwrite && target)
+ if (replace) {
+ rename_lock_for_replace(target, replace);
+ error = -ENOENT;
+ if (replace->i_nlink == 0 && !(replace->i_state & I_LINKABLE))
+ goto out;
+ error = -EMLINK;
+ if (max_links && replace->i_nlink >= max_links)
+ goto out;
+ } else if (overwrite && target) {
mutex_lock(&target->i_mutex);
+ }

error = -EBUSY;
if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
@@ -4050,7 +4100,9 @@ static int vfs_rename2(struct inode *old

if (old_dir->i_op->rename2) {
error = old_dir->i_op->rename2(old_dir, old_dentry,
- new_dir, new_dentry, flags);
+ new_dir, new_dentry,
+ replace_src, replace_dst,
+ flags);
} else {
error = old_dir->i_op->rename(old_dir, old_dentry,
new_dir, new_dentry);
@@ -4065,14 +4117,27 @@ static int vfs_rename2(struct inode *old
dont_mount(new_dentry);
}
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
- if (overwrite)
+ if (overwrite) {
d_move(old_dentry, new_dentry);
- else
+ if (replace) {
+ d_rehash(replace_dst);
+ if (replace->i_state & I_LINKABLE) {
+ spin_lock(&replace->i_lock);
+ replace->i_state &= ~I_LINKABLE;
+ spin_unlock(&replace->i_lock);
+ }
+
+ }
+ } else {
d_exchange(old_dentry, new_dentry);
+ }
}
out:
+ if (replace)
+ mutex_unlock(&replace->i_mutex);
if (overwrite && target)
mutex_unlock(&target->i_mutex);
+ dput(replace_dst);
dput(new_dentry);
if (!error) {
fsnotify_move(old_dir, new_dir, old_name, is_dir,
@@ -4080,6 +4145,8 @@ static int vfs_rename2(struct inode *old
if (!overwrite) {
fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
new_is_dir, NULL, new_dentry);
+ } else if (replace) {
+ fsnotify_link(old_dir, replace, replace_dst);
}
}
fsnotify_oldname_free(old_name);
@@ -4090,12 +4157,13 @@ static int vfs_rename2(struct inode *old
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- return vfs_rename2(old_dir, old_dentry, new_dir, new_dentry, 0);
+ return vfs_rename2(old_dir, old_dentry, new_dir, new_dentry, NULL, 0);
}


-SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
- int, newdfd, const char __user *, newname, unsigned int, flags)
+SYSCALL_DEFINE6(renameat2, int, olddfd, const char __user *, oldname,
+ int, newdfd, const char __user *, newname,
+ int, replacefd, unsigned int, flags)
{
struct dentry *old_dir, *new_dir;
struct dentry *old_dentry, *new_dentry;
@@ -4107,9 +4175,10 @@ SYSCALL_DEFINE5(renameat2, int, olddfd,
bool should_retry = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
int error;
+ struct dentry *replace_src = NULL;

error = -EOPNOTSUPP;
- if (flags & ~RENAME_EXCHANGE)
+ if (flags & ~(RENAME_EXCHANGE | RENAME_REPLACE))
goto exit;

retry:
@@ -4147,6 +4216,27 @@ SYSCALL_DEFINE5(renameat2, int, olddfd,
if (overwrite)
newnd.flags |= LOOKUP_RENAME_TARGET;

+
+ if (flags & RENAME_REPLACE) {
+ struct fd f = fdget_raw(replacefd);
+ error = -EBADF;
+ if (!f.file)
+ goto exit2;
+
+ replace_src = dget(f.file->f_path.dentry);
+
+ error = -EXDEV;
+ if (f.file->f_path.mnt != oldnd.path.mnt) {
+ fdput(f);
+ goto exit2;
+ }
+
+ error = may_linkat(&f.file->f_path);
+ fdput(f);
+ if (error)
+ goto exit2;
+ }
+
trap = lock_rename(new_dir, old_dir);

old_dentry = lookup_hash(&oldnd);
@@ -4198,10 +4288,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd,
&oldnd.path, old_dentry);
if (error)
goto exit5;
+ } else if (replace_src) {
+ error = security_path_link(replace_src, &oldnd.path,
+ old_dentry);
+ if (error)
+ goto exit5;
}

error = vfs_rename2(old_dir->d_inode, old_dentry,
- new_dir->d_inode, new_dentry, flags);
+ new_dir->d_inode, new_dentry,
+ replace_src, flags);
exit5:
dput(new_dentry);
exit4:
@@ -4210,6 +4306,7 @@ SYSCALL_DEFINE5(renameat2, int, olddfd,
unlock_rename(new_dir, old_dir);
mnt_drop_write(oldnd.path.mnt);
exit2:
+ dput(replace_src);
if (retry_estale(error, lookup_flags))
should_retry = true;
path_put(&newnd.path);
@@ -4229,12 +4326,12 @@ SYSCALL_DEFINE5(renameat2, int, olddfd,
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
- return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
+ return sys_renameat2(olddfd, oldname, newdfd, newname, -1, 0);
}

SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
- return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, -1, 0);
}

int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -634,7 +634,8 @@ enum inode_i_mutex_lock_class
I_MUTEX_PARENT,
I_MUTEX_CHILD,
I_MUTEX_XATTR,
- I_MUTEX_QUOTA
+ I_MUTEX_QUOTA,
+ I_MUTEX_REPLACE,
};

/*
@@ -1573,7 +1574,9 @@ struct inode_operations {
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
int (*rename2) (struct inode *, struct dentry *,
- struct inode *, struct dentry *, unsigned int);
+ struct inode *, struct dentry *,
+ struct dentry *, struct dentry *,
+ unsigned int);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2644,22 +2644,28 @@ extern void initialize_dirent_tail(struc
extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
+
+static inline unsigned char ext4_type_by_mode(umode_t mode)
+{
#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
-};
+ static const unsigned char __ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
+ };
+ return __ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+#undef S_SHIFT
+}

static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = ext4_type_by_mode(mode);
}


--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -36,6 +36,7 @@
#define SEEK_MAX SEEK_HOLE

#define RENAME_EXCHANGE (1 << 0) /* Exchange source and dest */
+#define RENAME_REPLACE (1 << 1) /* Link replacement to source */

struct fstrim_range {
__u64 start;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/