Re: Update of file offset on write() etc. is non-atomic with I/O

From: Linus Torvalds
Date: Mon Mar 03 2014 - 18:24:10 EST


On Mon, Mar 3, 2014 at 2:55 PM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> I think I'll respin this with the compat readv/writev case fixed and
> with the bitfield replaced with an "unsigned int" that we just do
> bitops by hand on. Because that code generation makes me feel slightly
> ill.

Ok, how about this version? For some reason I thought the compat
functions for readv/writev were in fs/compat.c, but they are there in
the same fs/read_write.c, so I didn't need to move the helper
functions away.

This just uses a "flags" field, and we currently only have two bits
that we use: FDPUT_FPUT and FDPUT_POS_UNLOCK. The first patch knows
that "fget_light()" writes 0/1 for that, which is the same as the
FDPUT_FPUT bit. I didn't bother to comment on it or clean it up, since
the second patch just removes that whole fget_light() mess.

Comments?

Linus
From 9ed5720e8a3139cc703ae19aef5fd3a72658d132 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Mon, 3 Mar 2014 09:36:58 -0800
Subject: [PATCH 1/2] vfs: atomic f_pos accesses as per POSIX

Our write() system call has always been atomic in the sense that you get
the expected thread-safe contiguous write, but we haven't actually
guaranteed that concurrent writes are serialized wrt f_pos accesses, so
threads (or processes) that share a file descriptor and use "write()"
concurrently would quite likely overwrite each others data.

This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says:

"2.9.7 Thread Interactions with Regular File Operations

All of the following functions shall be atomic with respect to each
other in the effects specified in POSIX.1-2008 when they operate on
regular files or symbolic links: [...]"

and one of the effects is the file position update.

This unprotected file position behavior is not new behavior, and nobody
has ever cared. Until now. Yongzhi Pan reported unexpected behavior to
Michael Kerrisk that was due to this.

This resolves the issue with a f_pos-specific lock that is taken by
read/write/lseek on file descriptors that may be shared across threads
or processes.

Reported-by: Yongzhi Pan <panyongzhi@xxxxxxxxx>
Reported-by: Michael Kerrisk <mtk.manpages@xxxxxxxxx>
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
---
fs/file_table.c | 1 +
fs/namei.c | 2 +-
fs/open.c | 4 ++++
fs/read_write.c | 54 ++++++++++++++++++++++++++++++++++++++--------------
include/linux/file.h | 6 ++++--
include/linux/fs.h | 6 +++++-
6 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 5fff9030be34..5b24008ea4f6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
+ mutex_init(&f->f_pos_lock);
eventpoll_init_file(f);
/* f->f_version: 0 */
return f;
diff --git a/fs/namei.c b/fs/namei.c
index 385f7817bfcc..2f730ef9b4b3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,

nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
- if (f.need_put)
+ if (f.flags & FDPUT_FPUT)
*fp = f.file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
rcu_read_lock();
diff --git a/fs/open.c b/fs/open.c
index 4b3e1edf2fe4..b9ed8b25c108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
return 0;
}

+ /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+ if (S_ISREG(inode->i_mode))
+ f->f_mode |= FMODE_ATOMIC_POS;
+
f->f_op = fops_get(inode->i_fop);
if (unlikely(WARN_ON(!f->f_op))) {
error = -ENODEV;
diff --git a/fs/read_write.c b/fs/read_write.c
index edc5746a902a..e5b43dd53b94 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(vfs_llseek);

+/*
+ * We only lock f_pos if we have threads or if the file might be
+ * shared with another process. In both cases we'll have an elevated
+ * file count (done either by fdget() or by fork()).
+ */
+static struct fd fdget_pos(int fd)
+{
+ struct fd f = fdget(fd);
+ struct file *file = f.file;
+
+ if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+ if (file_count(file) > 1) {
+ f.flags |= FDPUT_POS_UNLOCK;
+ mutex_lock(&file->f_pos_lock);
+ }
+ }
+ return f;
+}
+
+static void fdput_pos(struct fd f)
+{
+ if (f.flags & FDPUT_POS_UNLOCK)
+ mutex_unlock(&f.file->f_pos_lock);
+ fdput(f);
+}
+
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
off_t retval;
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
if (!f.file)
return -EBADF;

@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
- fdput(f);
+ fdput_pos(f);
return retval;
}

@@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
return ret;
}
@@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}

return ret;
@@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev);
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}

if (ret > 0)
@@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}

if (ret > 0)
@@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;

@@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
ret = compat_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
- fdput(f);
+ fdput_pos(f);
return ret;
}

@@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
const struct compat_iovec __user *, vec,
compat_ulong_t, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;

@@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
ret = compat_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
- fdput(f);
+ fdput_pos(f);
return ret;
}

diff --git a/include/linux/file.h b/include/linux/file.h
index cbacf4faf447..f2517fa2d610 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed)

struct fd {
struct file *file;
- int need_put;
+ unsigned int flags;
};
+#define FDPUT_FPUT 1
+#define FDPUT_POS_UNLOCK 2

static inline void fdput(struct fd fd)
{
- if (fd.need_put)
+ if (fd.flags & FDPUT_FPUT)
fput(fd.file);
}

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60829565e552..ebfde04bca06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH ((__force fmode_t)0x4000)

+/* File needs atomic accesses to f_pos */
+#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)

@@ -780,13 +783,14 @@ struct file {
const struct file_operations *f_op;

/*
- * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
+ * Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
+ struct mutex f_pos_lock;
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
--
1.9.0

From f55e00e40e6aba3e1e8fc701a7069a0b7a0868b7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Mon, 3 Mar 2014 14:43:03 -0800
Subject: [PATCH 2/2] vfs: get rid of old legacy "f{get,put}_[raw_]_light()"
interfaces

Everything but the net/socket.c use has long since been converted to the
fdget() model that has a cleaner calling convention, so just get rid of
the old interface and teach the socket code to use the new model using a
trivial shim layer.

Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
---
fs/file.c | 30 +++++++++++++++---------------
include/linux/file.h | 24 ++----------------------
net/socket.c | 17 ++++++++++++-----
3 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index db25c2bdfe46..baaa755f3919 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -683,33 +683,33 @@ EXPORT_SYMBOL(fget_raw);
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
*/
-struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed)
+static inline struct fd __fdget(unsigned int fd, fmode_t mask)
{
struct files_struct *files = current->files;
- struct file *file;
+ struct fd f = { NULL, };

- *fput_needed = 0;
if (atomic_read(&files->count) == 1) {
- file = __fcheck_files(files, fd);
- if (file && (file->f_mode & mask))
- file = NULL;
+ f.file = __fcheck_files(files, fd);
+ if (f.file && (f.file->f_mode & mask))
+ f.file = NULL;
} else {
- file = __fget(fd, mask);
- if (file)
- *fput_needed = 1;
+ f.file = __fget(fd, mask);
+ if (f.file)
+ f.flags = FDPUT_FPUT;
}

- return file;
+ return f;
}
-struct file *fget_light(unsigned int fd, int *fput_needed)
+
+struct fd fdget(unsigned int fd)
{
- return __fget_light(fd, FMODE_PATH, fput_needed);
+ return __fdget(fd, FMODE_PATH);
}
-EXPORT_SYMBOL(fget_light);
+EXPORT_SYMBOL(fdget);

-struct file *fget_raw_light(unsigned int fd, int *fput_needed)
+struct fd fdget_raw(unsigned int fd)
{
- return __fget_light(fd, 0, fput_needed);
+ return __fdget(fd, 0);
}

void set_close_on_exec(unsigned int fd, int flag)
diff --git a/include/linux/file.h b/include/linux/file.h
index f2517fa2d610..1f6b39394a9e 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -20,12 +20,6 @@ struct path;
extern struct file *alloc_file(struct path *, fmode_t mode,
const struct file_operations *fop);

-static inline void fput_light(struct file *file, int fput_needed)
-{
- if (fput_needed)
- fput(file);
-}
-
struct fd {
struct file *file;
unsigned int flags;
@@ -40,24 +34,10 @@ static inline void fdput(struct fd fd)
}

extern struct file *fget(unsigned int fd);
-extern struct file *fget_light(unsigned int fd, int *fput_needed);
-
-static inline struct fd fdget(unsigned int fd)
-{
- int b;
- struct file *f = fget_light(fd, &b);
- return (struct fd){f,b};
-}
-
extern struct file *fget_raw(unsigned int fd);
-extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);

-static inline struct fd fdget_raw(unsigned int fd)
-{
- int b;
- struct file *f = fget_raw_light(fd, &b);
- return (struct fd){f,b};
-}
+extern struct fd fdget(unsigned int fd);
+extern struct fd fdget_raw(unsigned int fd);

extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
diff --git a/net/socket.c b/net/socket.c
index 879933aaed4c..5ef37af499b3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -448,18 +448,25 @@ struct socket *sockfd_lookup(int fd, int *err)
}
EXPORT_SYMBOL(sockfd_lookup);

+static void fput_light(struct file *file, int flags)
+{
+ struct fd f = { file, flags };
+ fdput(f);
+}
+
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
- struct file *file;
+ struct fd f;
struct socket *sock;

*err = -EBADF;
- file = fget_light(fd, fput_needed);
- if (file) {
- sock = sock_from_file(file, err);
+ f = fdget(fd);
+ *fput_needed = f.flags;
+ if (f.file) {
+ sock = sock_from_file(f.file, err);
if (sock)
return sock;
- fput_light(file, *fput_needed);
+ fdput(f);
}
return NULL;
}
--
1.9.0