[RFC PATCH 3/4] vrange: Support fvrange() syscall for file based volatile ranges

From: John Stultz
Date: Wed Apr 03 2013 - 19:53:20 EST


Add vrange support to address_space structures, and add the fvrange()
syscall for creating file-based volatile ranges on them.

Cc: linux-mm@xxxxxxxxx
Cc: Michael Kerrisk <mtk.manpages@xxxxxxxxx>
Cc: Arun Sharma <asharma@xxxxxx>
Cc: Mel Gorman <mel@xxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: Dave Hansen <dave@xxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Neil Brown <neilb@xxxxxxx>
Cc: Mike Hommey <mh@xxxxxxxxxxxx>
Cc: Taras Glek <tglek@xxxxxxxxxxx>
Cc: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Cc: Jason Evans <je@xxxxxx>
Cc: sanjay@xxxxxxxxxx
Cc: Paul Turner <pjt@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michel Lespinasse <walken@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Minchan Kim <minchan@xxxxxxxxxx>
Signed-off-by: John Stultz <john.stultz@xxxxxxxxxx>
---
arch/x86/syscalls/syscall_64.tbl | 1 +
fs/file_table.c | 5 +++
fs/inode.c | 2 ++
include/linux/fs.h | 2 ++
include/linux/vrange.h | 19 +++++++++-
include/linux/vrange_types.h | 1 +
mm/vrange.c | 72 +++++++++++++++++++++++++++++++++++++-
7 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dc332bd..910d9f3 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -321,6 +321,7 @@
312 common kcmp sys_kcmp
313 common finit_module sys_finit_module
314 common vrange sys_vrange
+315 common fvrange sys_fvrange

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/file_table.c b/fs/file_table.c
index cd4d87a..61c8aaa 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -26,6 +26,7 @@
#include <linux/hardirq.h>
#include <linux/task_work.h>
#include <linux/ima.h>
+#include <linux/vrange.h>

#include <linux/atomic.h>

@@ -244,6 +245,10 @@ static void __fput(struct file *file)
file->f_op->fasync(-1, file, 0);
}
ima_file_free(file);
+
+ /* drop all vranges on last close */
+ mapping_exit_vrange(inode->i_mapping);
+
if (file->f_op && file->f_op->release)
file->f_op->release(inode, file);
security_file_free(file);
diff --git a/fs/inode.c b/fs/inode.c
index f5f7c06..4707c95 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
+#include <linux/vrange.h>
#include "internal.h"

/*
@@ -350,6 +351,7 @@ void address_space_init_once(struct address_space *mapping)
spin_lock_init(&mapping->private_lock);
mapping->i_mmap = RB_ROOT;
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+ mapping_init_vrange(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c28271..6f86c7c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -27,6 +27,7 @@
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/blk_types.h>
+#include <linux/vrange_types.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -411,6 +412,7 @@ struct address_space {
struct rb_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
struct mutex i_mmap_mutex; /* protect tree, count, list */
+ struct vrange_root vroot;
/* Protected by tree_lock together with the radix tree */
unsigned long nrpages; /* number of total pages */
pgoff_t writeback_index;/* writeback starts here */
diff --git a/include/linux/vrange.h b/include/linux/vrange.h
index b9b219c..91960eb 100644
--- a/include/linux/vrange.h
+++ b/include/linux/vrange.h
@@ -3,6 +3,7 @@

#include <linux/vrange_types.h>
#include <linux/mm.h>
+#include <linux/fs.h>

#define vrange_entry(ptr) \
container_of(ptr, struct vrange, node.rb)
@@ -11,10 +12,19 @@

static inline void mm_init_vrange(struct mm_struct *mm)
{
+ mm->vroot.type = VRANGE_ANON;
mm->vroot.v_rb = RB_ROOT;
mutex_init(&mm->vroot.v_lock);
}

+/* Initialise @mapping's vrange root: fresh lock, empty tree, VRANGE_FILE. */
+static inline void mapping_init_vrange(struct address_space *mapping)
+{
+	mutex_init(&mapping->vroot.v_lock);
+	mapping->vroot.v_rb = RB_ROOT;
+	mapping->vroot.type = VRANGE_FILE;
+}
+
static inline void vrange_lock(struct vrange_root *vroot)
{
mutex_lock(&vroot->v_lock);
@@ -25,15 +35,22 @@ static inline void vrange_unlock(struct vrange_root *vroot)
mutex_unlock(&vroot->v_lock);
}

-static inline struct mm_struct *vrange_get_owner_mm(struct vrange *vrange)
+static inline int vrange_type(struct vrange *vrange)
{
+ return vrange->owner->type;
+}

+static inline struct mm_struct *vrange_get_owner_mm(struct vrange *vrange)
+{
+ if (vrange_type(vrange) != VRANGE_ANON)
+ return NULL;
return container_of(vrange->owner, struct mm_struct, vroot);
}


void vrange_init(void);
extern void mm_exit_vrange(struct mm_struct *mm);
+extern void mapping_exit_vrange(struct address_space *mapping);
int discard_vpage(struct page *page);
bool vrange_address(struct mm_struct *mm, unsigned long start,
unsigned long end);
diff --git a/include/linux/vrange_types.h b/include/linux/vrange_types.h
index bede336..c7154e4 100644
--- a/include/linux/vrange_types.h
+++ b/include/linux/vrange_types.h
@@ -7,6 +7,7 @@
struct vrange_root {
struct rb_root v_rb; /* vrange rb tree */
struct mutex v_lock; /* Protect v_rb */
+ enum {VRANGE_ANON, VRANGE_FILE} type; /* range root type */
};


diff --git a/mm/vrange.c b/mm/vrange.c
index 9facbbc..671909c 100644
--- a/mm/vrange.c
+++ b/mm/vrange.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
+#include <linux/file.h>

struct vrange_walker_private {
struct zone *zone;
@@ -234,6 +235,20 @@ void mm_exit_vrange(struct mm_struct *mm)
}
}

+/* Release all vranges of @mapping.  NOTE(review): v_lock is not taken here. */
+void mapping_exit_vrange(struct address_space *mapping)
+{
+	struct rb_node *node = rb_first(&mapping->vroot.v_rb);
+
+	while (node) {
+		struct vrange *victim = vrange_entry(node);
+
+		node = rb_next(node);	/* advance before the range is removed */
+		__remove_range(victim);
+		put_vrange(victim);
+	}
+}
+
/*
* The vrange(2) system call.
*
@@ -291,6 +306,51 @@ out:
}


+/*
+ * The fvrange(2) system call.
+ *
+ * Mark (VRANGE_VOLATILE) or unmark (VRANGE_NOVOLATILE) the byte range
+ * [offset, offset + len) of the file behind @fd.  @offset must be page
+ * aligned and @len is rounded down to whole pages; @behavior is unused.
+ * Returns -EBADF, -ESPIPE (FIFO), -EINVAL, or the vrange helper's result.
+ */
+SYSCALL_DEFINE5(fvrange, int, fd, size_t, offset,
+		size_t, len, int, mode, int, behavior)
+{
+	struct fd f = fdget(fd);
+	struct address_space *mapping;
+	u64 start = offset;
+	u64 end;
+	int ret = -EINVAL;
+
+	if (!f.file)
+		return -EBADF;
+
+	if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+		ret = -ESPIPE;
+		goto out;
+	}
+
+	/* len is unsigned (size_t): no sign check needed, only alignment */
+	mapping = f.file->f_mapping;
+	if (!mapping || (start & ~PAGE_MASK))
+		goto out;
+
+	len &= PAGE_MASK;
+	end = start + len;
+	if (!len || end < start)
+		goto out;
+
+	if (mode == VRANGE_VOLATILE)
+		ret = add_vrange(&mapping->vroot, start, end - 1);
+	else if (mode == VRANGE_NOVOLATILE)
+		ret = remove_vrange(&mapping->vroot, start, end - 1);
+out:
+	fdput(f);
+	return ret;
+}
+
+
static bool __vrange_address(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
@@ -641,6 +701,9 @@ unsigned int discard_vrange(struct zone *zone, struct vrange *vrange,

mm = vrange_get_owner_mm(vrange);

+ if (!mm)
+ goto out;
+
if (!down_read_trylock(&mm->mmap_sem))
goto out;

@@ -683,6 +746,12 @@ static struct vrange *get_victim_vrange(void)
list_for_each_prev_safe(cur, tmp, &lru_vrange) {
vrange = list_entry(cur, struct vrange, lru);
mm = vrange_get_owner_mm(vrange);
+
+ if (!mm) {
+ vrange = NULL;
+ continue;
+ }
+
/* the process is exiting so pass it */
if (atomic_read(&mm->mm_users) == 0) {
list_del_init(&vrange->lru);
@@ -720,7 +789,8 @@ static void put_victim_range(struct vrange *vrange)
struct mm_struct *mm = vrange_get_owner_mm(vrange);

put_vrange(vrange);
- mmdrop(mm);
+ if (mm)
+ mmdrop(mm);
}

unsigned int discard_vrange_pages(struct zone *zone, int nr_to_discard)
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/