Re: [WiP]: aio support for migrating pages (Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable())

From: Benjamin LaHaise
Date: Tue Jun 11 2013 - 10:45:31 EST


Hi Tang,

On Tue, Jun 11, 2013 at 05:42:31PM +0800, Tang Chen wrote:
> Hi Benjamin,
>
> Are you still working on this problem?
>
> Thanks. :)

Below is a copy of the most recent version of this patch that I have
worked on. This version works and stands up to my testing, which uses
move_pages() to force migration of the aio ring buffer; a rough sketch
of that approach follows, and the full test program is available at
http://www.kvack.org/~bcrl/aio/aio-numa-test.c . Please note that this
version is not suitable for mainline, as the modifications to the anon
inode code are undesirable; that part needs reworking.

-ben


 fs/aio.c                | 113 ++++++++++++++++++++++++++++++++++++++++++----
 fs/anon_inodes.c        |  14 ++++-
 include/linux/migrate.h |   3 +
 mm/migrate.c            |   2 +-
 mm/swap.c               |   1 +
 5 files changed, 121 insertions(+), 12 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index c5b1a8c..a951690 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -35,6 +35,9 @@
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>

#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -108,6 +111,7 @@ struct kioctx {
} ____cacheline_aligned_in_smp;

struct page *internal_pages[AIO_RING_PAGES];
+ struct file *ctx_file;
};

/*------ sysctl variables----*/
@@ -136,18 +140,80 @@ __initcall(aio_setup);

static void aio_free_ring(struct kioctx *ctx)
{
- long i;
-
- for (i = 0; i < ctx->nr_pages; i++)
- put_page(ctx->ring_pages[i]);
+ int i;

if (ctx->mmap_size)
vm_munmap(ctx->mmap_base, ctx->mmap_size);

+ if (ctx->ctx_file)
+ truncate_setsize(ctx->ctx_file->f_inode, 0);
+
+ for (i = 0; i < ctx->nr_pages; i++) {
+ pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+ page_count(ctx->ring_pages[i]));
+ put_page(ctx->ring_pages[i]);
+ }
+
if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
kfree(ctx->ring_pages);
+
+ if (ctx->ctx_file) {
+ truncate_setsize(ctx->ctx_file->f_inode, 0);
+ pr_debug("pid(%d) i_nlink=%u d_count=%d, d_unhashed=%d i_count=%d\n",
+ current->pid, ctx->ctx_file->f_inode->i_nlink,
+ ctx->ctx_file->f_path.dentry->d_count,
+ d_unhashed(ctx->ctx_file->f_path.dentry),
+ atomic_read(&ctx->ctx_file->f_path.dentry->d_inode->i_count));
+ fput(ctx->ctx_file);
+ ctx->ctx_file = NULL;
+ }
+}
+
+static int aio_ctx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
+static const struct file_operations aio_ctx_fops = {
+ .mmap = aio_ctx_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+ return 0;
+}
+
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+ struct page *old, enum migrate_mode mode)
+{
+ struct kioctx *ctx = mapping->private_data;
+ unsigned long flags;
+ unsigned idx = old->index;
+ int rc;
+
+ BUG_ON(PageWriteback(old)); /* Writeback must be complete */
+ put_page(old);
+ rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+ if (rc != MIGRATEPAGE_SUCCESS) {
+ get_page(old);
+ return rc;
+ }
+ get_page(new);
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ migrate_page_copy(new, old);
+ ctx->ring_pages[idx] = new;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ return MIGRATEPAGE_SUCCESS;
}

+static const struct address_space_operations aio_ctx_aops = {
+ .set_page_dirty = aio_set_page_dirty,
+ .migratepage = aio_migratepage,
+};
+
static int aio_setup_ring(struct kioctx *ctx)
{
struct aio_ring *ring;
@@ -155,6 +221,7 @@ static int aio_setup_ring(struct kioctx *ctx)
struct mm_struct *mm = current->mm;
unsigned long size, populate;
int nr_pages;
+ int i;

/* Compensate for the ring buffer's head/tail overlap entry */
nr_events += 2; /* 1 is required, 2 for good luck */
@@ -166,6 +233,28 @@ static int aio_setup_ring(struct kioctx *ctx)
if (nr_pages < 0)
return -EINVAL;

+ ctx->ctx_file = anon_inode_getfile("[aio]", &aio_ctx_fops, ctx, O_RDWR);
+ if (IS_ERR(ctx->ctx_file)) {
+ ctx->ctx_file = NULL;
+ return -EAGAIN;
+ }
+ ctx->ctx_file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+ ctx->ctx_file->f_inode->i_mapping->private_data = ctx;
+ ctx->ctx_file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+ page = find_or_create_page(ctx->ctx_file->f_inode->i_mapping,
+ i, GFP_HIGHUSER | __GFP_ZERO);
+ if (!page)
+ break;
+ pr_debug("pid(%d) page[%d]->count=%d\n",
+ current->pid, i, page_count(page));
+ SetPageUptodate(page);
+ SetPageDirty(page);
+ unlock_page(page);
+ }
+
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);

ctx->nr_events = 0;
@@ -180,20 +269,25 @@ static int aio_setup_ring(struct kioctx *ctx)
ctx->mmap_size = nr_pages * PAGE_SIZE;
pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
down_write(&mm->mmap_sem);
- ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
- PROT_READ|PROT_WRITE,
- MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+ ctx->mmap_base = do_mmap_pgoff(ctx->ctx_file, 0, ctx->mmap_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, 0,
+ &populate);
if (IS_ERR((void *)ctx->mmap_base)) {
up_write(&mm->mmap_sem);
ctx->mmap_size = 0;
aio_free_ring(ctx);
return -EAGAIN;
}
+ up_write(&mm->mmap_sem);
+ mm_populate(ctx->mmap_base, populate);

pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
1, 0, ctx->ring_pages, NULL);
- up_write(&mm->mmap_sem);
+ for (i = 0; i < ctx->nr_pages; i++) {
+ put_page(ctx->ring_pages[i]);
+ }

if (unlikely(ctx->nr_pages != nr_pages)) {
aio_free_ring(ctx);
@@ -403,6 +497,8 @@ out_cleanup:
err = -EAGAIN;
aio_free_ring(ctx);
out_freectx:
+ if (ctx->ctx_file)
+ fput(ctx->ctx_file);
kmem_cache_free(kioctx_cachep, ctx);
pr_debug("error allocating ioctx %d\n", err);
return ERR_PTR(err);
@@ -852,6 +948,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
ioctx = ioctx_alloc(nr_events);
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
+ ctx = ioctx->user_id;
ret = put_user(ioctx->user_id, ctxp);
if (ret)
kill_ioctx(ioctx);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47a65df..376d289 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -131,6 +131,7 @@ struct file *anon_inode_getfile(const char *name,
struct qstr this;
struct path path;
struct file *file;
+ struct inode *inode;

if (IS_ERR(anon_inode_inode))
return ERR_PTR(-ENODEV);
@@ -138,6 +139,12 @@ struct file *anon_inode_getfile(const char *name,
if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);

+ inode = anon_inode_mkinode(anon_inode_inode->i_sb);
+ if (IS_ERR(inode)) {
+ file = ERR_PTR(-ENOMEM);
+ goto err_module;
+ }
+
/*
* Link the inode to a directory entry by creating a unique name
* using the inode sequence number.
@@ -155,17 +162,18 @@ struct file *anon_inode_getfile(const char *name,
* We know the anon_inode inode count is always greater than zero,
* so ihold() is safe.
*/
- ihold(anon_inode_inode);
+ //ihold(inode);

- d_instantiate(path.dentry, anon_inode_inode);
+ d_instantiate(path.dentry, inode);

file = alloc_file(&path, OPEN_FMODE(flags), fops);
if (IS_ERR(file))
goto err_dput;
- file->f_mapping = anon_inode_inode->i_mapping;
+ file->f_mapping = inode->i_mapping;

file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
file->private_data = priv;
+ drop_nlink(inode);

return file;

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a405d3dc..b6f3289 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm,
extern void migrate_page_copy(struct page *newpage, struct page *page);
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
+extern int migrate_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ struct buffer_head *head, enum migrate_mode mode);
#else

static inline void putback_lru_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index 27ed225..ac9c3a9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
* 2 for pages with a mapping
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
-static int migrate_page_move_mapping(struct address_space *mapping,
+int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
struct buffer_head *head, enum migrate_mode mode)
{
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71..bbfba0a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -160,6 +160,7 @@ skip_lock_tail:

void put_page(struct page *page)
{
+ BUG_ON(page_count(page) <= 0);
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))