[RFC PATCH v1 4/4] Allow to change the user namespace in which user rlimits are counted

From: Alexey Gladkov
Date: Mon Nov 02 2020 - 11:52:40 EST


Add a new prctl to change the user namespace in which the process
counter is located. A pointer to the user namespace is in cred struct
to be inherited by all child processes.

Signed-off-by: Alexey Gladkov <gladkov.alexey@xxxxxxxxx>
---
fs/exec.c | 2 +-
fs/io-wq.c | 13 ++++++++-----
fs/io-wq.h | 1 +
fs/io_uring.c | 1 +
include/linux/cred.h | 8 ++++++++
include/uapi/linux/prctl.h | 5 +++++
kernel/cred.c | 35 +++++++++++++++++++++++++++++------
kernel/exit.c | 2 +-
kernel/fork.c | 4 ++--
kernel/sys.c | 22 +++++++++++++++++++++-
kernel/user_namespace.c | 3 +++
11 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index c45dfc716394..574b1381276c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename,
goto out_ret;
}

- processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC);
+ processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC);

/*
* We move the actual failure in case of RLIMIT_NPROC excess from
diff --git a/fs/io-wq.c b/fs/io-wq.c
index c3b0843abc9b..19e43ec115cb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -116,6 +116,7 @@ struct io_wq {

struct task_struct *manager;
struct user_struct *user;
+ const struct cred *creds;
refcount_t refs;
struct completion done;

@@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_RUNNING)
atomic_dec(&acct->nr_running);
if (!(worker->flags & IO_WORKER_F_BOUND))
- dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
worker->flags = 0;
preempt_enable();

@@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
worker->flags |= IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
- dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
} else {
- if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
+ if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC))
return;
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
@@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
}

if (index == IO_WQ_ACCT_UNBOUND &&
- !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
+ !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) {
kfree(worker);
return false;
}
@@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
if (free_worker)
return true;

- processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC);
+ processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid,
+ UCOUNT_RLIMIT_NPROC);

if (processes >= acct->max_workers &&
!(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
@@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

/* caller must already hold a reference to this */
wq->user = data->user;
+ wq->creds = data->creds;

for_each_node(node) {
struct io_wqe *wqe;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 071f1a997800..6acc3a04c38f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **);

struct io_wq_data {
struct user_struct *user;
+ const struct cred *creds;

io_wq_work_fn *do_work;
free_work_fn *free_work;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 493e5047e67c..e419923968b3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
int ret = 0;

data.user = ctx->user;
+ data.creds = ctx->creds;
data.free_work = io_free_work;
data.do_work = io_wq_submit_work;

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 18639c069263..43aee68d117f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -144,6 +144,7 @@ struct cred {
#endif
struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+ struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */
struct group_info *group_info; /* supplementary groups for euid/fsgid */
/* RCU deletion */
union {
@@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
+extern int set_rlimit_ns(struct user_namespace *ns);

/*
* check for validity of credentials
@@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)

#define task_uid(task) (task_cred_xxx((task), uid))
#define task_euid(task) (task_cred_xxx((task), euid))
+#define task_rlimit_ns(task) (task_cred_xxx((task), rlimit_ns))

#define current_cred_xxx(xxx) \
({ \
@@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred)
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns() (current_cred_xxx(user_ns))
+#define current_rlimit_ns() (current_cred_xxx(rlimit_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
return &init_user_ns;
}
+static inline struct user_namespace *current_rlimit_ns(void)
+{
+ return &init_user_ns;
+}
#endif


diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 07b4f8131e36..4f853f903415 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -238,4 +238,9 @@ struct prctl_mm_map {
#define PR_SET_IO_FLUSHER 57
#define PR_GET_IO_FLUSHER 58

+#define PR_SET_RLIMIT_USER_NAMESPACE 59
+#define PR_GET_RLIMIT_USER_NAMESPACE 60
+# define PR_RLIMIT_BIND_GLOBAL_USERNS (1UL << 0)
+# define PR_RLIMIT_BIND_CURRENT_USERNS (1UL << 1)
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 748704db1f6b..7b90e1ef9c9a 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -59,6 +59,7 @@ struct cred init_cred = {
.cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
+ .rlimit_ns = &init_user_ns,
.group_info = &init_groups,
};

@@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
put_group_info(cred->group_info);
free_uid(cred->user);
put_user_ns(cred->user_ns);
+ put_user_ns(cred->rlimit_ns);
kmem_cache_free(cred_jar, cred);
}

@@ -270,6 +272,7 @@ struct cred *prepare_creds(void)
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
+ get_user_ns(new->rlimit_ns);

#ifdef CONFIG_KEYS
key_get(new->session_keyring);
@@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
#endif
clone_flags & CLONE_THREAD
) {
- if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC))
+ if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC))
return -EACCES;
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
@@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif

- if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+ if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
return -EACCES;
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
@@ -487,13 +490,13 @@ int commit_creds(struct cred *new)
* perhaps this limit is exceeded in the parent user namespace.
*/
alter_cred_subscribers(new, 2);
- if (new->user != old->user &&
- !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC))
+ if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) &&
+ !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC))
task->flags |= PF_NPROC_UNS_EXCEEDED;
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
- dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC);
+ if (new->user != old->user || new->rlimit_ns != old->rlimit_ns)
+ dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC);
alter_cred_subscribers(old, -2);

/* send notifications */
@@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode)
}
EXPORT_SYMBOL(set_create_files_as);

+/*
+ * Change the rlimit user namespace of the current task, replacing the existing
+ * one. If the given namespace is NULL, then initial user namespace will be
+ * used.
+ *
+ * Return: 0 on success; -errno on failure.
+ */
+int set_rlimit_ns(struct user_namespace *ns)
+{
+ struct cred *new;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ new->rlimit_ns = ns ? ns : &init_user_ns;
+
+ return commit_creds(new);
+}
+
#ifdef CONFIG_DEBUG_CREDENTIALS

bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/exit.c b/kernel/exit.c
index 5a0d7dd1ad64..998436d32373 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -187,7 +187,7 @@ void release_task(struct task_struct *p)
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
rcu_read_unlock();

cgroup_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index d2b28634dc8f..43f3c54fe4c6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process(
current->flags &= ~PF_NPROC_UNS_EXCEEDED;
goto bad_fork_free;
}
- processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid,
+ processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p),
UCOUNT_RLIMIT_NPROC);
if (processes >= task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
@@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC);
+ dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC);
exit_creds(p);
bad_fork_free:
p->state = TASK_DEAD;
diff --git a/kernel/sys.c b/kernel/sys.c
index db780ec32d86..917cbd7fc674 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -467,7 +467,7 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;

- processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);
+ processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC);

/*
* We don't fail in case of NPROC limit excess here because too many
@@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,

error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
+ case PR_SET_RLIMIT_USER_NAMESPACE:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ switch (arg2) {
+ case PR_RLIMIT_BIND_GLOBAL_USERNS:
+ error = set_rlimit_ns(&init_user_ns);
+ break;
+ case PR_RLIMIT_BIND_CURRENT_USERNS:
+ error = set_rlimit_ns(current_user_ns());
+ break;
+ default:
+ error = -EINVAL;
+ }
+ break;
+ case PR_GET_RLIMIT_USER_NAMESPACE:
+ error = current_rlimit_ns() == &init_user_ns
+ ? PR_RLIMIT_BIND_GLOBAL_USERNS
+ : PR_RLIMIT_BIND_CURRENT_USERNS;
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 87804e0371fe..346df35ceba9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
#endif
/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
cred->user_ns = user_ns;
+
+ cred->rlimit_ns = &init_user_ns;
}

/*
@@ -121,6 +123,7 @@ int create_user_ns(struct cred *new)
for (i = 0; i < UCOUNT_COUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
+ ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
ns->ucounts = ucounts;

/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
--
2.25.4