[PATCH 05/13] vfs: Replace array of file pointers with an IDR

From: Sandhya Bankar
Date: Sat Apr 29 2017 - 03:42:21 EST


Instead of storing all the file pointers in a single array, use an
IDR. It is RCU-safe, and does not need to be reallocated when the
fd array grows. It also handles allocation of new file descriptors.

Signed-off-by: Sandhya Bankar <bankarsandhya512@xxxxxxxxx>
[mawilcox@xxxxxxxxxxxxx: fixes]
Signed-off-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
---
fs/file.c | 180 ++++++++++++++++++++----------------------------
include/linux/fdtable.h | 10 +--
2 files changed, 79 insertions(+), 111 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index ad6f094..1c000d8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -47,7 +47,6 @@ static void *alloc_fdmem(size_t size)

static void __free_fdtable(struct fdtable *fdt)
{
- kvfree(fdt->fd);
kvfree(fdt->open_fds);
kfree(fdt);
}
@@ -89,15 +88,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
- unsigned int cpy, set;
-
BUG_ON(nfdt->max_fds < ofdt->max_fds);
-
- cpy = ofdt->max_fds * sizeof(struct file *);
- set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
- memcpy(nfdt->fd, ofdt->fd, cpy);
- memset((char *)nfdt->fd + cpy, 0, set);
-
copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}

@@ -131,15 +122,11 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (!fdt)
goto out;
fdt->max_fds = nr;
- data = alloc_fdmem(nr * sizeof(struct file *));
- if (!data)
- goto out_fdt;
- fdt->fd = data;

data = alloc_fdmem(max_t(size_t,
2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
if (!data)
- goto out_arr;
+ goto out_fdt;
fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
@@ -148,8 +135,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr)

return fdt;

-out_arr:
- kvfree(fdt->fd);
out_fdt:
kfree(fdt);
out:
@@ -170,6 +155,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
struct fdtable *new_fdt, *cur_fdt;

spin_unlock(&files->file_lock);
+ idr_preload_end();
new_fdt = alloc_fdtable(nr);

/* make sure all __fd_install() have seen resize_in_progress
@@ -178,6 +164,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
if (atomic_read(&files->count) > 1)
synchronize_sched();

+ idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
if (!new_fdt)
return -ENOMEM;
@@ -228,8 +215,10 @@ static int expand_files(struct files_struct *files, unsigned int nr)

if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock);
+ idr_preload_end();
expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress);
+ idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
goto repeat;
}
@@ -290,8 +279,8 @@ static unsigned int count_open_files(struct fdtable *fdt)
struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
- struct file **old_fds, **new_fds;
unsigned int open_files, i;
+ struct file *f;
struct fdtable *old_fdt, *new_fdt;

*errorp = -ENOMEM;
@@ -302,6 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
atomic_set(&newf->count, 1);

spin_lock_init(&newf->file_lock);
+ idr_init(&newf->fd_idr);
newf->resize_in_progress = false;
init_waitqueue_head(&newf->resize_wait);
newf->next_fd = 0;
@@ -310,8 +300,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init;
new_fdt->full_fds_bits = newf->full_fds_bits_init;
- new_fdt->fd = &newf->fd_array[0];

+restart:
+ idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
open_files = count_open_files(old_fdt);
@@ -321,6 +312,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
*/
while (unlikely(open_files > new_fdt->max_fds)) {
spin_unlock(&oldf->file_lock);
+ idr_preload_end();

if (new_fdt != &newf->fdtab)
__free_fdtable(new_fdt);
@@ -343,41 +335,50 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
* who knows it may have a new bigger fd table. We need
* the latest pointer.
*/
+ idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
open_files = count_open_files(old_fdt);
}

+ if (!idr_check_preload(&oldf->fd_idr)) {
+ spin_unlock(&oldf->file_lock);
+ idr_preload_end();
+ goto restart;
+ }
+
copy_fd_bitmaps(new_fdt, old_fdt, open_files);

- old_fds = old_fdt->fd;
- new_fds = new_fdt->fd;
-
- for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
- if (f) {
- get_file(f);
- } else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
- __clear_open_fd(open_files - i, new_fdt);
+ idr_for_each_entry(&oldf->fd_idr, f, i) {
+ int err;
+
+ get_file(f);
+ err = idr_alloc(&newf->fd_idr, f, i, i + 1, GFP_NOWAIT);
+ if (WARN(err != i, "Could not allocate %d: %d", i, err)) {
+ spin_unlock(&oldf->file_lock);
+ goto out;
}
- rcu_assign_pointer(*new_fds++, f);
}
+
spin_unlock(&oldf->file_lock);
+ idr_preload_end();

- /* clear the remainder */
- memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
+ /*
+ * The fd may be claimed in the fd bitmap but not yet
+ * instantiated in the files array if a sibling thread
+ * is partway through open().
+ */
+ for_each_set_bit(i, new_fdt->open_fds, new_fdt->max_fds) {
+ if (!idr_find(&newf->fd_idr, i))
+ __clear_bit(i, new_fdt->open_fds);
+ }

rcu_assign_pointer(newf->fdt, new_fdt);

return newf;

out_release:
+ idr_destroy(&newf->fd_idr);
kmem_cache_free(files_cachep, newf);
out:
return NULL;
@@ -401,7 +402,8 @@ static struct fdtable *close_files(struct files_struct * files)
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
+ struct file *file;
+ file = idr_remove(&files->fd_idr, i);
if (file) {
filp_close(file, files);
cond_resched_rcu_qs();
@@ -469,28 +471,14 @@ struct files_struct init_files = {
.fdt = &init_files.fdtab,
.fdtab = {
.max_fds = NR_OPEN_DEFAULT,
- .fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .fd_idr = IDR_INIT,
};

-static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
-{
- unsigned int maxfd = fdt->max_fds;
- unsigned int maxbit = maxfd / BITS_PER_LONG;
- unsigned int bitbit = start / BITS_PER_LONG;
-
- bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
- return maxfd;
- if (bitbit > start)
- start = bitbit;
- return find_next_zero_bit(fdt->open_fds, maxfd, start);
-}
-
/*
* allocate a file descriptor, mark it busy.
*/
@@ -501,54 +489,37 @@ int __alloc_fd(struct files_struct *files,
int error;
struct fdtable *fdt;

+ idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
-repeat:
- fdt = files_fdtable(files);
- fd = start;
- if (fd < files->next_fd)
- fd = files->next_fd;

- if (fd < fdt->max_fds)
- fd = find_next_fd(fdt, fd);
-
- /*
- * N.B. For clone tasks sharing a files structure, this test
- * will limit the total number of files that can be opened.
- */
- error = -EMFILE;
- if (fd >= end)
+ error = idr_alloc(&files->fd_idr, NULL, start, end, GFP_NOWAIT);
+ if (error == -ENOSPC) {
+ error = -EMFILE;
goto out;
+ }
+ BUG_ON(error < 0);
+ fd = error;

error = expand_files(files, fd);
- if (error < 0)
+ if (error < 0) {
+ idr_remove(&files->fd_idr, fd);
goto out;
-
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- if (error)
- goto repeat;
+ }

if (start <= files->next_fd)
files->next_fd = fd + 1;

+ fdt = files_fdtable(files);
__set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
error = fd;
-#if 1
- /* Sanity check */
- if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
- printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
- rcu_assign_pointer(fdt->fd[fd], NULL);
- }
-#endif

out:
spin_unlock(&files->file_lock);
+ idr_preload_end();
return error;
}

@@ -575,6 +546,7 @@ void put_unused_fd(unsigned int fd)
{
struct files_struct *files = current->files;
spin_lock(&files->file_lock);
+ BUG_ON(idr_remove(&files->fd_idr, fd));
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
}
@@ -604,22 +576,9 @@ void put_unused_fd(unsigned int fd)
void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
- struct fdtable *fdt;
-
- might_sleep();
- rcu_read_lock_sched();
-
- while (unlikely(files->resize_in_progress)) {
- rcu_read_unlock_sched();
- wait_event(files->resize_wait, !files->resize_in_progress);
- rcu_read_lock_sched();
- }
- /* coupled with smp_wmb() in expand_fdtable() */
- smp_rmb();
- fdt = rcu_dereference_sched(files->fdt);
- BUG_ON(fdt->fd[fd] != NULL);
- rcu_assign_pointer(fdt->fd[fd], file);
- rcu_read_unlock_sched();
+ rcu_read_lock();
+ BUG_ON(idr_replace(&files->fd_idr, file, fd));
+ rcu_read_unlock();
}

void fd_install(unsigned int fd, struct file *file)
@@ -641,10 +600,9 @@ int __close_fd(struct files_struct *files, unsigned fd)
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
goto out_unlock;
- file = fdt->fd[fd];
+ file = idr_remove(&files->fd_idr, fd);
if (!file)
goto out_unlock;
- rcu_assign_pointer(fdt->fd[fd], NULL);
__clear_close_on_exec(fd, fdt);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
@@ -676,10 +634,9 @@ void do_close_on_exec(struct files_struct *files)
struct file *file;
if (!(set & 1))
continue;
- file = fdt->fd[fd];
+ file = idr_remove(&files->fd_idr, fd);
if (!file)
continue;
- rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
filp_close(file, files);
@@ -842,17 +799,27 @@ static int do_dup2(struct files_struct *files,
* tables and this condition does not arise without those.
*/
fdt = files_fdtable(files);
- tofree = fdt->fd[fd];
+ tofree = idr_find(&files->fd_idr, fd);
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
- rcu_assign_pointer(fdt->fd[fd], file);
+ if (tofree) {
+ idr_replace(&files->fd_idr, file, fd);
+ } else {
+ int err = idr_alloc(&files->fd_idr, file, fd, fd + 1,
+ GFP_NOWAIT);
+ if (err != fd) {
+ WARN(1, "do_dup2 got %d for fd %d\n", err, fd);
+ goto Ebusy;
+ }
+ }
__set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock);
+ idr_preload_end();

if (tofree)
filp_close(tofree, files);
@@ -861,6 +828,7 @@ static int do_dup2(struct files_struct *files,

Ebusy:
spin_unlock(&files->file_lock);
+ idr_preload_end();
return -EBUSY;
}

@@ -875,6 +843,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
if (fd >= rlimit(RLIMIT_NOFILE))
return -EBADF;

+ idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
err = expand_files(files, fd);
if (unlikely(err < 0))
@@ -883,6 +852,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)

out_unlock:
spin_unlock(&files->file_lock);
+ idr_preload_end();
return err;
}

@@ -901,6 +871,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
if (newfd >= rlimit(RLIMIT_NOFILE))
return -EBADF;

+ idr_preload(GFP_KERNEL);
spin_lock(&files->file_lock);
err = expand_files(files, newfd);
file = fcheck(oldfd);
@@ -917,6 +888,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
err = -EBADF;
out_unlock:
spin_unlock(&files->file_lock);
+ idr_preload_end();
return err;
}

@@ -974,7 +946,7 @@ int iterate_fd(struct files_struct *files, unsigned n,
spin_lock(&files->file_lock);
for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
struct file *file;
- file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+ file = __fcheck_files(files, n);
if (!file)
continue;
res = f(p, file, n);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 6e84b2cae..4072f24 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/idr.h>

#include <linux/atomic.h>

@@ -23,7 +24,6 @@

struct fdtable {
unsigned int max_fds;
- struct file __rcu **fd; /* current fd array */
unsigned long *close_on_exec;
unsigned long *open_fds;
unsigned long *full_fds_bits;
@@ -51,6 +51,7 @@ struct files_struct {
bool resize_in_progress;
wait_queue_head_t resize_wait;

+ struct idr fd_idr;
struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*
@@ -61,7 +62,6 @@ struct files_struct {
unsigned long close_on_exec_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
- struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
@@ -79,11 +79,7 @@ struct files_struct {
*/
static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd)
{
- struct fdtable *fdt = rcu_dereference_raw(files->fdt);
-
- if (fd < fdt->max_fds)
- return rcu_dereference_raw(fdt->fd[fd]);
- return NULL;
+ return idr_find(&files->fd_idr, fd);
}

static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
--
1.8.3.1