Re: [PATCH 1/9] exec_path 1/9: introduce ->exec_path and switch/proc/*/exe

From: Matt Helsley
Date: Thu Jun 04 2009 - 03:56:00 EST


On Thu, Jun 04, 2009 at 03:04:22AM +0400, Alexey Dobriyan wrote:
> On Sun, May 31, 2009 at 03:19:53PM -0700, Andrew Morton wrote:
> > On Mon, 1 Jun 2009 01:54:27 +0400 Alexey Dobriyan <adobriyan@xxxxxxxxx> wrote:
> >
> > > And BTW, there is something unnatural when executable path is attached
> > > to mm_struct(!) not task_struct,
> >
> > mm_struct is the central object for a heavyweight process. All threads
> > within that process share the same executable path (don't they?) so
> > attaching the executable path to the mm seems OK to me.
>
> OK, let's try this:
>
>
> [PATCH 1/9] exec_path 1/9: introduce ->exec_path and switch /proc/*/exe
>
> ->exec_path marks executable which is associated with running task.
> Binfmt loader decides which executable is such and can, in theory,
> assign anything. Unlike current status quo when first VM_EXECUTABLE mapping is
> sort of marks running executable.
>
> If executable unmaps its all VM_EXECUTABLE mappings, /proc/*/exe ceases
> to exists, ick! And userpsace can't even use MAP_EXECUTABLE.

Suprising but intentional and unavoidable. More below..

>
> Tasks which aren't created by running clone(2) and execve(2)
> (read: kernel threads) get empty ->exec_path and
>
> ->exec_path is copied on clone(2) and put at do_exit() time.

Doesn't this pin the vfs mount of the executable for the lifetime of
the task?

That was one of Al Viro's objections to early revisions of the exe_file
patches. It's the reason the exe_file patches kept track of the number of
VM_EXECUTABLE mappings with num_exe_file_vmas.

I've cc'd Al so he can confirm/deny my recollection of this. Basically
some programs need to be able to umount the filesystem that back their
executables. Being able to unmap these regions was effectively a
userspace API for unpinning these mounts. I needed to preserve that API,
hence the VMA ugliness of exe_file that you object to with the exe_file
patches.

I think patches 2-7 look great and could be adapted to use exe_file instead
of ->exec_path.

Cheers,
-Matt Helsley

>
> ->exec_path is going to replace struct mm_struct::exe_file et al
> and allows to remove VM_EXECUTABLE flag while keeping readlink("/proc/*/exe")
> without loop over all VMAs.
>
> Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx>
> ---
> fs/binfmt_aout.c | 1 +
> fs/binfmt_elf.c | 1 +
> fs/binfmt_elf_fdpic.c | 1 +
> fs/binfmt_flat.c | 1 +
> fs/binfmt_som.c | 1 +
> fs/proc/base.c | 38 ++++++++++++++------------------------
> include/linux/sched.h | 25 +++++++++++++++++++++++++
> kernel/exit.c | 1 +
> kernel/fork.c | 2 ++
> 9 files changed, 47 insertions(+), 24 deletions(-)
>
> diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
> index b639dcf..a19b185 100644
> --- a/fs/binfmt_aout.c
> +++ b/fs/binfmt_aout.c
> @@ -379,6 +379,7 @@ beyond_if:
> regs->gp = ex.a_gpvalue;
> #endif
> start_thread(regs, ex.a_entry, current->mm->start_stack);
> + set_task_exec_path(current, &bprm->file->f_path);
> return 0;
> }
>
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 40381df..b815bfc 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -999,6 +999,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
> #endif
>
> start_thread(regs, elf_entry, bprm->p);
> + set_task_exec_path(current, &bprm->file->f_path);
> retval = 0;
> out:
> kfree(loc);
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index fdb66fa..f545504 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -1185,6 +1185,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
> seg++;
> }
>
> + set_task_exec_path(current, &file->f_path);
> return 0;
> }
>
> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
> index 697f6b5..a16f977 100644
> --- a/fs/binfmt_flat.c
> +++ b/fs/binfmt_flat.c
> @@ -798,6 +798,7 @@ static int load_flat_file(struct linux_binprm * bprm,
> libinfo->lib_list[id].start_brk) + /* start brk */
> stack_len);
>
> + set_task_exec_path(current, &bprm->file->f_path);
> return 0;
> err:
> return ret;
> diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
> index eff74b9..6c56262 100644
> --- a/fs/binfmt_som.c
> +++ b/fs/binfmt_som.c
> @@ -174,6 +174,7 @@ static int map_som_binary(struct file *file,
> up_write(&current->mm->mmap_sem);
> if (retval > 0 || retval < -1024)
> retval = 0;
> + set_task_exec_path(current, &bprm->file->f_path);
> out:
> set_fs(old_fs);
> return retval;
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 3326bbf..dc4ee6a 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -201,6 +201,20 @@ static int proc_root_link(struct inode *inode, struct path *path)
> return result;
> }
>
> +static int proc_exe_link(struct inode *inode, struct path *path)
> +{
> + struct task_struct *tsk;
> +
> + tsk = get_proc_task(inode);
> + if (!tsk)
> + return -ENOENT;
> + get_task_exec_path(tsk, path);
> + put_task_struct(tsk);
> + if (!path->mnt || !path->dentry)
> + return -ENOENT;
> + return 0;
> +}
> +
> /*
> * Return zero if current may access user memory in @task, -error if not.
> */
> @@ -1302,30 +1316,6 @@ void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
> newmm->exe_file = get_mm_exe_file(oldmm);
> }
>
> -static int proc_exe_link(struct inode *inode, struct path *exe_path)
> -{
> - struct task_struct *task;
> - struct mm_struct *mm;
> - struct file *exe_file;
> -
> - task = get_proc_task(inode);
> - if (!task)
> - return -ENOENT;
> - mm = get_task_mm(task);
> - put_task_struct(task);
> - if (!mm)
> - return -ENOENT;
> - exe_file = get_mm_exe_file(mm);
> - mmput(mm);
> - if (exe_file) {
> - *exe_path = exe_file->f_path;
> - path_get(&exe_file->f_path);
> - fput(exe_file);
> - return 0;
> - } else
> - return -ENOENT;
> -}
> -
> static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
> {
> struct inode *inode = dentry->d_inode;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index b4c38bc..6b2dd01 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1265,6 +1265,12 @@ struct task_struct {
> #endif
> /* CPU-specific state of this task */
> struct thread_struct thread;
> + /*
> + * Executable, binfmt loader wants to associate with task
> + * (read: execve(2) argument).
> + * Empty, if concept isn't applicable, e. g. kernel thread.
> + */
> + struct path exec_path;
> /* filesystem information */
> struct fs_struct *fs;
> /* open file information */
> @@ -2403,6 +2409,25 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
>
> #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
>
> +static inline void get_task_exec_path(struct task_struct *tsk, struct path *path)
> +{
> + task_lock(tsk);
> + path_get(&tsk->exec_path);
> + *path = tsk->exec_path;
> + task_unlock(tsk);
> +}
> +
> +static inline void set_task_exec_path(struct task_struct *tsk, struct path *path)
> +{
> + struct path old_path;
> +
> + path_get(path);
> + task_lock(tsk);
> + old_path = tsk->exec_path;
> + tsk->exec_path = *path;
> + task_unlock(tsk);
> + path_put(&old_path);
> +}
> #endif /* __KERNEL__ */
>
> #endif
> diff --git a/kernel/exit.c b/kernel/exit.c
> index abf9cf3..8e70b54 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -962,6 +962,7 @@ NORET_TYPE void do_exit(long code)
>
> exit_sem(tsk);
> exit_files(tsk);
> + set_task_exec_path(tsk, &(struct path){ .mnt = NULL, .dentry = NULL });
> exit_fs(tsk);
> check_stack_usage();
> exit_thread();
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b9e2edd..c0ee931 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1191,6 +1191,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> cgroup_fork_callbacks(p);
> cgroup_callbacks_done = 1;
>
> + get_task_exec_path(current, &p->exec_path);
> +
> /* Need tasklist lock for parent etc handling! */
> write_lock_irq(&tasklist_lock);
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/