Re: [RFC][PATCH] fix move/migrate_pages() race on task struct

From: Christoph Lameter
Date: Fri Feb 24 2012 - 10:20:56 EST


On Thu, 23 Feb 2012, Eric W. Biederman wrote:

> > The bug in migrate_pages() is that we do a rcu_unlock and a rcu_lock. If
> > we drop those then we should be safe if the use of a task pointer within a
> > rcu section is safe without taking a refcount.
>
> Yes the user of a task_struct pointer found via a userspace pid is valid
> for the life of an rcu critical section, and the bug is indeed that we
> drop the rcu_lock and somehow expect the task to remain valid.
>
> The guarantee comes from release_task. In release_task we call
> __exit_signal which calls __unhash_process, and then we call
> delayed_put_task to guarantee that the task lives until the end of the
> rcu interval.

Ah. Ok. Great.

> In migrate_pages we have a lot of task accesses outside of the rcu
> critical section, and without a reference count on task.

Yes but that is only of interesting for setup and verification of
permissions. What matters during migration is that the mm_struct does not
go away and we take a refcount on that one.

> I tell you the truth trying to figure out what that code needs to be
> correct if task != current makes my head hurt.

Hmm...

> I think we need to grab a reference on task_struct, to stop the task
> from going away, and in addition we need to hold task_lock. To keep
> task->mm from changing (see exec_mmap). But we can't do that and sleep
> so I think the entire function needs to be rewritten, and the need for
> task deep in the migrate_pages path needs to be removed as even with the
> reference count held we can race with someone calling exec.

We dont need the task during migration. We only need the mm. The task
is safe until rcu_read_unlock therefore maybe the following should fix
migrate pages:


Subject: migration: Do not do rcu_read_unlock until the last time we need the task_struct pointer

Migration functions perform the rcu_read_unlock too early. As a result the
task pointed to may change. Bugs were introduced when adding security checks
because rcu_unlock/lock sequences were inserted. Plus the security checks
and do_move_pages used the task_struct pointer after rcu_unlock.

Fix those issues by removing the unlock/lock sequences and moving the
rcu_read_unlock after the last use of the task struct pointer.

Signed-off-by: Christoph Lameter <cl@xxxxxxxxx>


---
mm/mempolicy.c | 22 +++++++++++-----------
mm/migrate.c | 28 +++++++++++++++-------------
2 files changed, 26 insertions(+), 24 deletions(-)

Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c 2012-01-13 04:04:36.229807226 -0600
+++ linux-2.6/mm/mempolicy.c 2012-02-24 03:11:44.913710625 -0600
@@ -1318,16 +1318,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi
rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- rcu_read_unlock();
err = -ESRCH;
- goto out;
+ goto unlock_out;
}
mm = get_task_mm(task);
- rcu_read_unlock();

err = -EINVAL;
if (!mm)
- goto out;
+ goto unlock_out;

/*
* Check if this process has the right to modify the specified
@@ -1335,33 +1333,31 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
!capable(CAP_SYS_NICE)) {
- rcu_read_unlock();
err = -EPERM;
- goto out;
+ goto unlock_out;
}
- rcu_read_unlock();

task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
- goto out;
+ goto unlock_out;
}

if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
- goto out;
+ goto unlock_out;
}

err = security_task_movememory(task);
if (err)
- goto out;
+ goto unlock_out;

+ rcu_read_unlock();
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
@@ -1370,6 +1366,10 @@ out:
NODEMASK_SCRATCH_FREE(scratch);

return err;
+
+unlock_out:
+ rcu_read_unlock();
+ goto out;
}


Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c 2012-02-06 04:25:35.857094372 -0600
+++ linux-2.6/mm/migrate.c 2012-02-24 03:18:55.569698851 -0600
@@ -1176,20 +1176,17 @@ set_status:
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
*/
-static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
unsigned long nr_pages,
const void __user * __user *pages,
const int __user *nodes,
int __user *status, int flags)
{
struct page_to_node *pm;
- nodemask_t task_nodes;
unsigned long chunk_nr_pages;
unsigned long chunk_start;
int err;

- task_nodes = cpuset_mems_allowed(task);
-
err = -ENOMEM;
pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
if (!pm)
@@ -1351,6 +1348,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid,
struct task_struct *task;
struct mm_struct *mm;
int err;
+ nodemask_t task_nodes;

/* Check flags */
if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1367,10 +1365,11 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid,
return -ESRCH;
}
mm = get_task_mm(task);
- rcu_read_unlock();

- if (!mm)
+ if (!mm) {
+ rcu_read_unlock();
return -EINVAL;
+ }

/*
* Check if this process has the right to modify the specified
@@ -1378,24 +1377,23 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid,
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
!capable(CAP_SYS_NICE)) {
- rcu_read_unlock();
err = -EPERM;
- goto out;
+ goto unlock_out;
}
- rcu_read_unlock();

err = security_task_movememory(task);
if (err)
- goto out;
+ goto unlock_out;

+ task_nodes = cpuset_mems_allowed(task);
+ rcu_read_unlock();
if (nodes) {
- err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
- flags);
+ err = do_pages_move(mm, task_nodes, nr_pages, pages, nodes,
+ status, flags);
} else {
err = do_pages_stat(mm, nr_pages, pages, status);
}
@@ -1403,6 +1401,10 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid,
out:
mmput(mm);
return err;
+
+unlock_out:
+ rcu_read_unlock();
+ goto out;
}

/*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/