Re: [RFC][PATCH] fix move/migrate_pages() race on task struct

From: Eric W. Biederman
Date: Fri Feb 24 2012 - 18:12:29 EST


Christoph Lameter <cl@xxxxxxxxx> writes:

> On Fri, 24 Feb 2012, Dave Hansen wrote:
>
>> > Is that all safe? If not then we need to take a refcount on the task
>> > struct after all.
>>
>> Urg, no we can't sleep under an rcu_read_lock().
>
> Ok so take a count and drop it before entering the main migration
> function?

As an alternative way at looking at things.

Taking a quick look it does appear that in cpuset_mems_allowed and it's
cousins we never sleep under "callback_mutex" so that lock looks like it
could become a spinlock.

But I have to say something just bothers me about the permissions for
modifying an mm living in the task. We can have different rules
for modifying an mm depending on the path to tme mm?

Especially in things like which numa nodes we can put pages in?

So by specifying a different pid to access them mm through the call can
either work or succeed? Are these checks really sane?

Eric

> ---
> mm/mempolicy.c | 12 +++++++-----
> mm/migrate.c | 20 +++++++++++---------
> 2 files changed, 18 insertions(+), 14 deletions(-)
>
> Index: linux-2.6/mm/mempolicy.c
> ===================================================================
> --- linux-2.6.orig/mm/mempolicy.c 2012-02-24 04:10:01.621614996 -0600
> +++ linux-2.6/mm/mempolicy.c 2012-02-24 05:01:43.621530156 -0600
> @@ -1293,7 +1293,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi
> {
>
> const struct cred *cred = current_cred(), *tcred;
> struct mm_struct *mm = NULL;
> - struct task_struct *task;
> + struct task_struct *task = NULL;
> nodemask_t task_nodes;
> int err;
> nodemask_t *old;
> @@ -1318,10 +1318,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi
> rcu_read_lock();
> task = pid ? find_task_by_vpid(pid) : current;
> if (!task) {
> - rcu_read_unlock();
> err = -ESRCH;
> goto out;
> }
> + get_task_struct(task);
> mm = get_task_mm(task);
> rcu_read_unlock();
>
> @@ -1335,16 +1335,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi
> * capabilities, superuser privileges or the same
> * userid as the target process.
> */
> - rcu_read_lock();
> tcred = __task_cred(task);
> if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
> cred->uid != tcred->suid && cred->uid != tcred->uid &&
> !capable(CAP_SYS_NICE)) {
> - rcu_read_unlock();
> err = -EPERM;
> goto out;
> }
> - rcu_read_unlock();
>
> task_nodes = cpuset_mems_allowed(task);
> /* Is the user allowed to access the target nodes? */
> @@ -1362,9 +1359,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pi
> if (err)
> goto out;
>
> + put_task_struct(task);
> + task = NULL;
> err = do_migrate_pages(mm, old, new,
> capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
> out:
> + if (task)
> + put_task_struct(task);
> +
> if (mm)
> mmput(mm);
> NODEMASK_SCRATCH_FREE(scratch);
> Index: linux-2.6/mm/migrate.c
> ===================================================================
> --- linux-2.6.orig/mm/migrate.c 2012-02-24 04:10:01.609614993 -0600
> +++ linux-2.6/mm/migrate.c 2012-02-24 05:07:39.493520424 -0600
> @@ -1176,20 +1176,17 @@ set_status:
> * Migrate an array of page address onto an array of nodes and fill
> * the corresponding array of status.
> */
> -static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
> +static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
> unsigned long nr_pages,
> const void __user * __user *pages,
> const int __user *nodes,
> int __user *status, int flags)
> {
> struct page_to_node *pm;
> - nodemask_t task_nodes;
> unsigned long chunk_nr_pages;
> unsigned long chunk_start;
> int err;
>
> - task_nodes = cpuset_mems_allowed(task);
> -
> err = -ENOMEM;
> pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
> if (!pm)
> @@ -1351,6 +1348,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid,
> struct task_struct *task;
> struct mm_struct *mm;
> int err;
> + nodemask_t task_nodes;
>
> /* Check flags */
> if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
> @@ -1366,6 +1364,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid,
> rcu_read_unlock();
> return -ESRCH;
> }
> + get_task_struct(task);
> mm = get_task_mm(task);
> rcu_read_unlock();
>
> @@ -1378,30 +1377,33 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid,
> * capabilities, superuser privileges or the same
> * userid as the target process.
> */
> - rcu_read_lock();
> tcred = __task_cred(task);
> if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
> cred->uid != tcred->suid && cred->uid != tcred->uid &&
> !capable(CAP_SYS_NICE)) {
> - rcu_read_unlock();
> err = -EPERM;
> goto out;
> }
> - rcu_read_unlock();
>
> err = security_task_movememory(task);
> if (err)
> goto out;
>
> + task_nodes = cpuset_mems_allowed(task);
> + put_task_struct(task);
> + task = NULL;
> +
> if (nodes) {
> - err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
> - flags);
> + err = do_pages_move(mm, task_nodes, nr_pages, pages, nodes,
> + status, flags);
> } else {
> err = do_pages_stat(mm, nr_pages, pages, status);
> }
>
> out:
> mmput(mm);
> + if (task)
> + put_task_struct(task);
> return err;
> }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/