Re: [PATCH V3] binder: ipc namespace support for android binder

From: Todd Kjos
Date: Fri Nov 09 2018 - 12:54:36 EST


On Thu, Nov 8, 2018 at 5:02 AM chouryzhou(åå) <chouryzhou@xxxxxxxxxxx> wrote:
>
> We are working for running android in container, but we found that binder is
> not isolated by ipc namespace. Since binder is a form of IPC and therefore should
> be tied to ipc namespace. With this patch, we can run more than one android
> container on one host.
> This patch move "binder_procs" and "binder_context" into ipc_namespace,
> driver will find the context from it when opening. Although statistics in debugfs
> remain global.
>
> Signed-off-by: chouryzhou <chouryzhou@xxxxxxxxxxx>
> ---
> drivers/android/binder.c | 128 +++++++++++++++++++++++++++++++-----------
> include/linux/ipc_namespace.h | 15 +++++
> ipc/namespace.c | 10 +++-
> 3 files changed, 117 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/android/binder.c b/drivers/android/binder.c
> index cb30a524d16d..22e45bb937e6 100644
> --- a/drivers/android/binder.c
> +++ b/drivers/android/binder.c
> @@ -68,6 +68,7 @@
> #include <linux/seq_file.h>
> #include <linux/uaccess.h>
> #include <linux/pid_namespace.h>
> +#include <linux/ipc_namespace.h>
> #include <linux/security.h>
> #include <linux/spinlock.h>
> #include <linux/ratelimit.h>
> @@ -80,13 +81,18 @@
> #include "binder_alloc.h"
> #include "binder_trace.h"
>
> +
> +#if !defined(CONFIG_SYSVIPC) && !defined(CONFIG_POSIX_MQUEUE)

I still don't understand the dependencies on SYSVIPC or POSIX_MQUEUE.
It seems like this mechanism would work even if both are disabled --
as long as IPC_NS is enabled. Seems cleaner to change init/Kconfig and
allow IPC_NS if CONFIG_ANDROID_BINDER_IPC and change this line to
"#ifndef CONFIG_IPC_NS"

> +struct ipc_namespace init_ipc_ns;
> +#define ipcns (&init_ipc_ns)
> +#else
> +#define ipcns (current->nsproxy->ipc_ns)
> +#endif
> +
> static HLIST_HEAD(binder_deferred_list);
> static DEFINE_MUTEX(binder_deferred_lock);
>
> static HLIST_HEAD(binder_devices);
> -static HLIST_HEAD(binder_procs);
> -static DEFINE_MUTEX(binder_procs_lock);
> -
> static HLIST_HEAD(binder_dead_nodes);
> static DEFINE_SPINLOCK(binder_dead_nodes_lock);
>
> @@ -232,7 +238,7 @@ struct binder_transaction_log_entry {
> int return_error_line;
> uint32_t return_error;
> uint32_t return_error_param;
> - const char *context_name;
> + int context_device;
> };
> struct binder_transaction_log {
> atomic_t cur;
> @@ -263,19 +269,64 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
> }
>
> struct binder_context {
> + struct hlist_node hlist;
> struct binder_node *binder_context_mgr_node;
> struct mutex context_mgr_node_lock;
>
> kuid_t binder_context_mgr_uid;
> - const char *name;
> + int device;
> };
>
> struct binder_device {
> struct hlist_node hlist;
> struct miscdevice miscdev;
> - struct binder_context context;
> };
>
> +void binder_exit_ns(struct ipc_namespace *ns)
> +{
> + struct binder_context *context;
> + struct hlist_node *tmp;
> +
> + mutex_destroy(&ns->binder_procs_lock);
> + hlist_for_each_entry_safe(context, tmp, &ns->binder_contexts, hlist) {
> + mutex_destroy(&context->context_mgr_node_lock);
> + hlist_del(&context->hlist);
> + kfree(context);
> + }
> +}
> +
> +int binder_init_ns(struct ipc_namespace *ns)
> +{
> + int ret;
> + struct binder_device *device;
> +
> + mutex_init(&ns->binder_procs_lock);
> + INIT_HLIST_HEAD(&ns->binder_procs);
> + INIT_HLIST_HEAD(&ns->binder_contexts);
> +
> + hlist_for_each_entry(device, &binder_devices, hlist) {
> + struct binder_context *context;
> +
> + context = kzalloc(sizeof(*context), GFP_KERNEL);
> + if (!context) {
> + ret = -ENOMEM;
> + goto err;
> + }
> +
> + context->device = device->miscdev.minor;
> + context->binder_context_mgr_uid = INVALID_UID;
> + mutex_init(&context->context_mgr_node_lock);
> +
> + hlist_add_head(&context->hlist, &ns->binder_contexts);
> + }
> +
> + return 0;
> +err:
> + binder_exit_ns(ns);
> + return ret;
> +}
> +
> +
> /**
> * struct binder_work - work enqueued on a worklist
> * @entry: node enqueued on list
> @@ -2727,7 +2778,7 @@ static void binder_transaction(struct binder_proc *proc,
> e->target_handle = tr->target.handle;
> e->data_size = tr->data_size;
> e->offsets_size = tr->offsets_size;
> - e->context_name = proc->context->name;
> + e->context_device = proc->context->device;

why eliminate name? The string name is very useful for differentiating
normal "framework" binder transactions vs "hal" or "vendor"
transactions. If we just have a device number it will be hard to tell
in the logs even which namespace it belongs to. We need to keep both
the "name" (for which there might be multiple in each ns) and some
indication of which namespace this is. Maybe we assign some sort of
namespace ID during binder_init_ns().

>
> if (reply) {
> binder_inner_proc_lock(proc);
> @@ -4922,6 +4973,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
> {
> struct binder_proc *proc;
> struct binder_device *binder_dev;
> + struct binder_context *context;
>
> binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__,
> current->group_leader->pid, current->pid);
> @@ -4937,7 +4989,15 @@ static int binder_open(struct inode *nodp, struct file *filp)
> proc->default_priority = task_nice(current);
> binder_dev = container_of(filp->private_data, struct binder_device,
> miscdev);
> - proc->context = &binder_dev->context;
> + hlist_for_each_entry(context, &ipcns->binder_contexts, hlist) {
> + if (context->device == binder_dev->miscdev.minor) {
> + proc->context = context;
> + break;
> + }
> + }
> + if (!proc->context)
> + return -ENOENT;
> +
> binder_alloc_init(&proc->alloc);
>
> binder_stats_created(BINDER_STAT_PROC);
> @@ -4946,9 +5006,9 @@ static int binder_open(struct inode *nodp, struct file *filp)
> INIT_LIST_HEAD(&proc->waiting_threads);
> filp->private_data = proc;
>
> - mutex_lock(&binder_procs_lock);
> - hlist_add_head(&proc->proc_node, &binder_procs);
> - mutex_unlock(&binder_procs_lock);
> + mutex_lock(&ipcns->binder_procs_lock);
> + hlist_add_head(&proc->proc_node, &ipcns->binder_procs);
> + mutex_unlock(&ipcns->binder_procs_lock);
>
> if (binder_debugfs_dir_entry_proc) {
> char strbuf[11];
> @@ -5082,9 +5142,9 @@ static void binder_deferred_release(struct binder_proc *proc)
> struct rb_node *n;
> int threads, nodes, incoming_refs, outgoing_refs, active_transactions;
>
> - mutex_lock(&binder_procs_lock);
> + mutex_lock(&ipcns->binder_procs_lock);
> hlist_del(&proc->proc_node);
> - mutex_unlock(&binder_procs_lock);
> + mutex_unlock(&ipcns->binder_procs_lock);
>
> mutex_lock(&context->context_mgr_node_lock);
> if (context->binder_context_mgr_node &&
> @@ -5377,7 +5437,7 @@ static void print_binder_proc(struct seq_file *m,
> struct binder_node *last_node = NULL;
>
> seq_printf(m, "proc %d\n", proc->pid);
> - seq_printf(m, "context %s\n", proc->context->name);
> + seq_printf(m, "context %d\n", proc->context->device);

As mentioned above, we need to retain name and probably also want a ns
id of some sort. So context now has 3 components if IPC_NS, so maybe a
helper function to print context like:

static void binder_seq_print_context(struct seq_file *m, struct
binder_context *context)
{
#ifdef CONFIG_IPC_NS
seq_printf(m, "%d-%d-%s", context->ns_id, context->device,
context->name);
#else
seq_printf(m, "%d", context->name);
#endif
}

(same comment below everywhere context is printed)

Should these debugfs nodes should be ns aware and only print debugging
info for the context of the thread accessing the node? If so, we would
also want to be namespace-aware when printing pids.

> header_pos = m->count;
>
> binder_inner_proc_lock(proc);
> @@ -5538,7 +5598,7 @@ static void print_binder_proc_stats(struct seq_file *m,
> binder_alloc_get_free_async_space(&proc->alloc);
>
> seq_printf(m, "proc %d\n", proc->pid);
> - seq_printf(m, "context %s\n", proc->context->name);
> + seq_printf(m, "context %d\n", proc->context->device);
> count = 0;
> ready_threads = 0;
> binder_inner_proc_lock(proc);
> @@ -5623,10 +5683,10 @@ static int binder_state_show(struct seq_file *m, void *unused)
> if (last_node)
> binder_put_node(last_node);
>
> - mutex_lock(&binder_procs_lock);
> - hlist_for_each_entry(proc, &binder_procs, proc_node)
> + mutex_lock(&ipcns->binder_procs_lock);
> + hlist_for_each_entry(proc, &ipcns->binder_procs, proc_node)
> print_binder_proc(m, proc, 1);
> - mutex_unlock(&binder_procs_lock);
> + mutex_unlock(&ipcns->binder_procs_lock);
>
> return 0;
> }
> @@ -5639,10 +5699,10 @@ static int binder_stats_show(struct seq_file *m, void *unused)
>
> print_binder_stats(m, "", &binder_stats);
>
> - mutex_lock(&binder_procs_lock);
> - hlist_for_each_entry(proc, &binder_procs, proc_node)
> + mutex_lock(&ipcns->binder_procs_lock);
> + hlist_for_each_entry(proc, &ipcns->binder_procs, proc_node)
> print_binder_proc_stats(m, proc);
> - mutex_unlock(&binder_procs_lock);
> + mutex_unlock(&ipcns->binder_procs_lock);
>
> return 0;
> }
> @@ -5652,10 +5712,10 @@ static int binder_transactions_show(struct seq_file *m, void *unused)
> struct binder_proc *proc;
>
> seq_puts(m, "binder transactions:\n");
> - mutex_lock(&binder_procs_lock);
> - hlist_for_each_entry(proc, &binder_procs, proc_node)
> + mutex_lock(&ipcns->binder_procs_lock);
> + hlist_for_each_entry(proc, &ipcns->binder_procs, proc_node)
> print_binder_proc(m, proc, 0);
> - mutex_unlock(&binder_procs_lock);
> + mutex_unlock(&ipcns->binder_procs_lock);
>
> return 0;
> }
> @@ -5665,14 +5725,14 @@ static int binder_proc_show(struct seq_file *m, void *unused)
> struct binder_proc *itr;
> int pid = (unsigned long)m->private;
>
> - mutex_lock(&binder_procs_lock);
> - hlist_for_each_entry(itr, &binder_procs, proc_node) {
> + mutex_lock(&ipcns->binder_procs_lock);
> + hlist_for_each_entry(itr, &ipcns->binder_procs, proc_node) {
> if (itr->pid == pid) {
> seq_puts(m, "binder proc state:\n");
> print_binder_proc(m, itr, 1);
> }
> }
> - mutex_unlock(&binder_procs_lock);
> + mutex_unlock(&ipcns->binder_procs_lock);
>
> return 0;
> }
> @@ -5687,10 +5747,10 @@ static void print_binder_transaction_log_entry(struct seq_file *m,
> */
> smp_rmb();
> seq_printf(m,
> - "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d",
> + "%d: %s from %d:%d to %d:%d context %d node %d handle %d size %d:%d ret %d/%d l=%d",
> e->debug_id, (e->call_type == 2) ? "reply" :
> ((e->call_type == 1) ? "async" : "call "), e->from_proc,
> - e->from_thread, e->to_proc, e->to_thread, e->context_name,
> + e->from_thread, e->to_proc, e->to_thread, e->context_device,
> e->to_node, e->target_handle, e->data_size, e->offsets_size,
> e->return_error, e->return_error_param,
> e->return_error_line);
> @@ -5753,10 +5813,6 @@ static int __init init_binder_device(const char *name)
> binder_device->miscdev.minor = MISC_DYNAMIC_MINOR;
> binder_device->miscdev.name = name;
>
> - binder_device->context.binder_context_mgr_uid = INVALID_UID;
> - binder_device->context.name = name;
> - mutex_init(&binder_device->context.context_mgr_node_lock);
> -
> ret = misc_register(&binder_device->miscdev);
> if (ret < 0) {
> kfree(binder_device);
> @@ -5832,8 +5888,12 @@ static int __init binder_init(void)
> goto err_init_binder_device_failed;
> }
>
> - return ret;
> + ret = binder_init_ns(&init_ipc_ns);
> + if (ret)
> + goto err_init_namespace_failed;
>
> + return ret;
> +err_init_namespace_failed:
> err_init_binder_device_failed:
> hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) {
> misc_deregister(&device->miscdev);
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index 6ab8c1bada3f..d7f850a2ded8 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -63,6 +63,13 @@ struct ipc_namespace {
> unsigned int mq_msg_default;
> unsigned int mq_msgsize_default;
>
> +#ifdef CONFIG_ANDROID_BINDER_IPC
> + /* next fields are for binder */
> + struct mutex binder_procs_lock;
> + struct hlist_head binder_procs;
> + struct hlist_head binder_contexts;
> +#endif
> +
> /* user_ns which owns the ipc ns */
> struct user_namespace *user_ns;
> struct ucounts *ucounts;
> @@ -118,6 +125,14 @@ extern int mq_init_ns(struct ipc_namespace *ns);
> static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
> #endif
>
> +#ifdef CONFIG_ANDROID_BINDER_IPC
> +extern int binder_init_ns(struct ipc_namespace *ns);
> +extern void binder_exit_ns(struct ipc_namespace *ns);
> +#else
> +static inline int binder_init_ns(struct ipc_namespace *ns) { return 0; }
> +static inline void binder_exit_ns(struct ipc_namespace *ns) { }
> +#endif

I'm fine with this if the namespace.[hc] maintainers are, but would it
be better to add some sort of registration function so we don't need
binder-specific stuff here?

A version of this was proposed by Oren Laadan in:

https://lists.linuxfoundation.org/pipermail/containers/2013-December/033834.html

and then updated for 4.9 in a proposal in AOSP:

https://android-review.googlesource.com/c/kernel/common/+/471961

There will be some discussions on containerized binder support at LPC,
so we'll want to wait until after LPC before moving ahead with this.

-Todd


> +
> #if defined(CONFIG_IPC_NS)
> extern struct ipc_namespace *copy_ipcs(unsigned long flags,
> struct user_namespace *user_ns, struct ipc_namespace *ns);
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index 21607791d62c..68c6e983b002 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -57,7 +57,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
>
> err = mq_init_ns(ns);
> if (err)
> - goto fail_put;
> + goto fail_init_mq;
> + err = binder_init_ns(ns);
> + if (err)
> + goto fail_init_binder;
>
> sem_init_ns(ns);
> msg_init_ns(ns);
> @@ -65,7 +68,9 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
>
> return ns;
>
> -fail_put:
> +fail_init_binder:
> + mq_put_mnt(ns);
> +fail_init_mq:
> put_user_ns(ns->user_ns);
> ns_free_inum(&ns->ns);
> fail_free:
> @@ -120,6 +125,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
> sem_exit_ns(ns);
> msg_exit_ns(ns);
> shm_exit_ns(ns);
> + binder_exit_ns(ns);
>
> dec_ipc_namespaces(ns->ucounts);
> put_user_ns(ns->user_ns);
> --
> 2.11.0