Re: [patch 1/2] mm: fincore()

From: Simon Jeons
Date: Tue Feb 19 2013 - 05:30:46 EST


Hi Johannes,
On 02/15/2013 02:34 PM, Johannes Weiner wrote:
On Mon, Feb 11, 2013 at 02:12:39PM -0800, Andrew Morton wrote:
Also, having to mmap the file to be able to query pagecache state is a
hack. Whatever happened to the fincore() patch?
I don't know, but how about this one:

---
From: Johannes Weiner <hannes@xxxxxxxxxxx>
Subject: [patch 1/2] mm: fincore()

Provide a syscall to determine whether a given file's pages are cached
in memory. This is more elegant than mmapping the file for the sole
purpose of using mincore(), and also works on NOMMU.

Who are the users of mincore()/fincore()? In which scenarios do user processes need to know whether their pages are resident in memory?

Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
---
include/linux/syscalls.h | 2 +
mm/Makefile | 2 +-
mm/fincore.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 131 insertions(+), 1 deletion(-)
create mode 100644 mm/fincore.c

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 313a8e0..3ceab2a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -897,4 +897,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
unsigned long idx1, unsigned long idx2);
asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_fincore(unsigned int fd, loff_t start, loff_t len,
+ unsigned char __user * vec);
#endif
diff --git a/mm/Makefile b/mm/Makefile
index 185a22b..221cdae 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o \
- interval_tree.o $(mmu-y)
+ interval_tree.o fincore.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/fincore.c b/mm/fincore.c
new file mode 100644
index 0000000..d504611
--- /dev/null
+++ b/mm/fincore.c
@@ -0,0 +1,128 @@
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+/*
+ * Fill vec[] with the cache status of up to nr_pages pages of the
+ * given mapping, starting at page index pgstart.
+ *
+ * Each byte written is the PageUptodate() result for a page found in
+ * mapping->page_tree, or 0 for an index that has no page there (holes
+ * and exceptional radix tree entries).
+ *
+ * The tree is walked under rcu_read_lock() only. If the walk meets a
+ * hole inside the range, nr_pages is trimmed so that the result ends
+ * right before the next populated index; the caller in this file
+ * loops to pick up the remainder.
+ *
+ * Returns the number of valid vec[] entries, i.e. the (possibly
+ * trimmed) nr_pages. vec must be kernel memory with room for
+ * nr_pages bytes.
+ */
+static long do_fincore(struct address_space *mapping, pgoff_t pgstart,
+ unsigned long nr_pages, unsigned char *vec)
+{
+ pgoff_t pgend = pgstart + nr_pages; /* first index past the range */
+ struct radix_tree_iter iter;
+ void **slot;
+ long nr = 0; /* vec[] entries filled by the walk so far */
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, pgstart) {
+ unsigned char present;
+ struct page *page;
+
+ /*
+ * Handle holes: radix_tree_for_each_slot() skips empty slots,
+ * so an index other than pgstart + nr means pages were
+ * skipped. If the next populated index is still inside the
+ * range, end this call's result right before it. Either way
+ * the skipped entries are zeroed after the loop.
+ */
+ if (iter.index != pgstart + nr) {
+ if (iter.index < pgend)
+ nr_pages = iter.index - pgstart;
+ break;
+ }
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue; /* nr not advanced: the next slot takes the hole path */
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ WARN_ON(iter.index);
+ goto restart;
+ }
+ present = 0; /* exceptional entry, not a page: report as not cached */
+ } else {
+ if (!page_cache_get_speculative(page))
+ goto repeat; /* no reference obtained: re-read the slot */
+
+ /*
+ * Has the page moved? Re-check the slot now that a
+ * reference is held; if it changed, drop the reference
+ * and look again.
+ */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ present = PageUptodate(page);
+ page_cache_release(page);
+ }
+ vec[nr] = present;
+
+ if (++nr == nr_pages) /* vec[] is full */
+ break;
+ }
+ rcu_read_unlock();
+
+ if (nr < nr_pages) /* zero the entries the walk did not fill */
+ memset(vec + nr, 0, nr_pages - nr);
+
+ return nr_pages;
+}
+
+/*
+ * The fincore(2) system call.
+ *
+ * fincore() returns the memory residency status of the given file's
+ * pages, in the range [start, start + len).
+ * The status is returned in a vector of bytes, one byte per page of
+ * the range (len is rounded up to a whole number of pages). The least
+ * significant bit of each byte is 1 if the referenced page is in
+ * memory, otherwise it is zero.
+ *
+ * Because the status of a page can change after fincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.
+ *
+ * return values:
+ *  zero    - success (also when len is zero; vec is left untouched)
+ *  -EBADF  - fd isn't a valid open file descriptor
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - start is not a multiple of PAGE_CACHE_SIZE, or start or
+ *            len is negative
+ */
+SYSCALL_DEFINE4(fincore, unsigned int, fd, loff_t, start, loff_t, len,
+		unsigned char __user *, vec)
+{
+	unsigned long nr_pages;
+	pgoff_t pgstart;
+	struct fd f;
+	/* Stays 0 if the loop below never runs (len == 0). */
+	long ret = 0;
+
+	/*
+	 * loff_t is signed: a negative start or len would turn into a
+	 * huge page index or page count once converted below.
+	 */
+	if (start < 0 || len < 0)
+		return -EINVAL;
+
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	pgstart = start >> PAGE_CACHE_SHIFT;
+	nr_pages = DIV_ROUND_UP(len, PAGE_CACHE_SIZE);
+
+	while (nr_pages) {
+		/* Kernel-side bounce buffer, copied out one chunk at a time. */
+		unsigned char tmp[64];
+
+		/* The cast keeps both min() operands the same type. */
+		ret = do_fincore(f.file->f_mapping, pgstart,
+				 min(nr_pages, (unsigned long)sizeof(tmp)),
+				 tmp);
+		if (ret <= 0)
+			break;
+
+		if (copy_to_user(vec, tmp, ret)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/* do_fincore() may return fewer pages than asked for. */
+		nr_pages -= ret;
+		pgstart += ret;
+		vec += ret;
+		/* Chunk done: report success unless a later chunk fails. */
+		ret = 0;
+	}
+
+	fdput(f);
+
+	return ret;
+}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/