Re: [v6,1/2] pid: Replace pid bitmap implementation with IDR API

From: Andrei Vagin
Date: Thu Oct 19 2017 - 03:31:13 EST


Hi Gargi,

This patch breaks CRIU, because it changes a meaning of ns_last_pid.

========================== Run zdtm/static/env00 in h ==========================
DEP env00.d
CC env00.o
LINK env00
Start test
./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST
Run criu dump
Run criu restore
=[log]=> dump/zdtm/static/env00/52/1/restore.log
------------------------ grep Error ------------------------
(00.000587) No mountpoints-6.img image
(00.000593) mnt: Reading mountpoint images (id 6 pid 52)
(00.000653) Forking task with 52 pid (flags 0x0)
(00.007568) PID: real 51 virt 52
(00.010363) 52: Error (criu/cr-restore.c:1787): Pid 51 do not match expected 52 (task 52)
(00.010474) Error (criu/cr-restore.c:2449): Restoring FAILED.
------------------------ ERROR OVER ------------------------

Before this patch, ns_last_pid contains a pid of a last process. With
this patch, it contains a pid of a "next" process.

In CRIU we use ns_last_pid to restore a process with a specified pid,
and now this logic is broken:

$ uname -a
Linux laptop 4.11.11-200.fc25.x86_64 #1 SMP Mon Jul 17 17:41:12 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
$ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
20000

$ uname -a
Linux fc24 4.14.0-rc5-next-20171018 #1 SMP Wed Oct 18 23:52:43 PDT 2017 x86_64 x86_64 x86_64 GNU/Linux
$ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
19999

Thanks,
Andrei

On Wed, Oct 11, 2017 at 06:19:38PM -0400, Gargi Sharma wrote:
> This patch replaces the current bitmap implemetation for
> Process ID allocation. Functions that are no longer required,
> for example, free_pidmap(), alloc_pidmap(), etc. are removed.
> The rest of the functions are modified to use the IDR API.
> The change was made to make the PID allocation less complex by
> replacing custom code with calls to generic API.
>
> Signed-off-by: Gargi Sharma <gs051095@xxxxxxxxx>
> Reviewed-by: Rik van Riel <riel@xxxxxxxxxx>
> ---
> arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
> fs/proc/loadavg.c | 2 +-
> include/linux/pid_namespace.h | 14 +--
> init/main.c | 2 +-
> kernel/pid.c | 201 ++++++------------------------
> kernel/pid_namespace.c | 44 +++----
> 6 files changed, 57 insertions(+), 208 deletions(-)
>
> diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
> index 1fbb5da..e47761c 100644
> --- a/arch/powerpc/platforms/cell/spufs/sched.c
> +++ b/arch/powerpc/platforms/cell/spufs/sched.c
> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
> LOAD_INT(c), LOAD_FRAC(c),
> count_active_contexts(),
> atomic_read(&nr_spu_contexts),
> - task_active_pid_ns(current)->last_pid);
> + idr_get_cursor(&task_active_pid_ns(current)->idr));
> return 0;
> }
>
> diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
> index 983fce5..ba3d0e2 100644
> --- a/fs/proc/loadavg.c
> +++ b/fs/proc/loadavg.c
> @@ -23,7 +23,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
> LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
> LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
> nr_running(), nr_threads,
> - task_active_pid_ns(current)->last_pid);
> + idr_get_cursor(&task_active_pid_ns(current)->idr));
> return 0;
> }
>
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index b09136f..f4db4a7 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -9,15 +9,8 @@
> #include <linux/nsproxy.h>
> #include <linux/kref.h>
> #include <linux/ns_common.h>
> +#include <linux/idr.h>
>
> -struct pidmap {
> - atomic_t nr_free;
> - void *page;
> -};
> -
> -#define BITS_PER_PAGE (PAGE_SIZE * 8)
> -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
> -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
>
> struct fs_pin;
>
> @@ -29,9 +22,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
>
> struct pid_namespace {
> struct kref kref;
> - struct pidmap pidmap[PIDMAP_ENTRIES];
> + struct idr idr;
> struct rcu_head rcu;
> - int last_pid;
> unsigned int nr_hashed;
> struct task_struct *child_reaper;
> struct kmem_cache *pid_cachep;
> @@ -105,6 +97,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>
> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
> void pidhash_init(void);
> -void pidmap_init(void);
> +void pid_idr_init(void);
>
> #endif /* _LINUX_PID_NS_H */
> diff --git a/init/main.c b/init/main.c
> index 0ee9c686..9f4db20 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -667,7 +667,7 @@ asmlinkage __visible void __init start_kernel(void)
> if (late_time_init)
> late_time_init();
> calibrate_delay();
> - pidmap_init();
> + pid_idr_init();
> anon_vma_init();
> acpi_early_init();
> #ifdef CONFIG_X86
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 020dedb..0ce5936 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -39,6 +39,7 @@
> #include <linux/proc_ns.h>
> #include <linux/proc_fs.h>
> #include <linux/sched/task.h>
> +#include <linux/idr.h>
>
> #define pid_hashfn(nr, ns) \
> hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
> @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
> int pid_max_min = RESERVED_PIDS + 1;
> int pid_max_max = PID_MAX_LIMIT;
>
> -static inline int mk_pid(struct pid_namespace *pid_ns,
> - struct pidmap *map, int off)
> -{
> - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
> -}
> -
> -#define find_next_offset(map, off) \
> - find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
>
> /*
> * PID-map pages start out as NULL, they get allocated upon
> @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
> */
> struct pid_namespace init_pid_ns = {
> .kref = KREF_INIT(2),
> - .pidmap = {
> - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
> - },
> - .last_pid = 0,
> + .idr = IDR_INIT,
> .nr_hashed = PIDNS_HASH_ADDING,
> .level = 0,
> .child_reaper = &init_task,
> @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
>
> static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
>
> -static void free_pidmap(struct upid *upid)
> -{
> - int nr = upid->nr;
> - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
> - int offset = nr & BITS_PER_PAGE_MASK;
> -
> - clear_bit(offset, map->page);
> - atomic_inc(&map->nr_free);
> -}
> -
> -/*
> - * If we started walking pids at 'base', is 'a' seen before 'b'?
> - */
> -static int pid_before(int base, int a, int b)
> -{
> - /*
> - * This is the same as saying
> - *
> - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
> - * and that mapping orders 'a' and 'b' with respect to 'base'.
> - */
> - return (unsigned)(a - base) < (unsigned)(b - base);
> -}
> -
> -/*
> - * We might be racing with someone else trying to set pid_ns->last_pid
> - * at the pid allocation time (there's also a sysctl for this, but racing
> - * with this one is OK, see comment in kernel/pid_namespace.c about it).
> - * We want the winner to have the "later" value, because if the
> - * "earlier" value prevails, then a pid may get reused immediately.
> - *
> - * Since pids rollover, it is not sufficient to just pick the bigger
> - * value. We have to consider where we started counting from.
> - *
> - * 'base' is the value of pid_ns->last_pid that we observed when
> - * we started looking for a pid.
> - *
> - * 'pid' is the pid that we eventually found.
> - */
> -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
> -{
> - int prev;
> - int last_write = base;
> - do {
> - prev = last_write;
> - last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
> - } while ((prev != last_write) && (pid_before(base, last_write, pid)));
> -}
> -
> -static int alloc_pidmap(struct pid_namespace *pid_ns)
> -{
> - int i, offset, max_scan, pid, last = pid_ns->last_pid;
> - struct pidmap *map;
> -
> - pid = last + 1;
> - if (pid >= pid_max)
> - pid = RESERVED_PIDS;
> - offset = pid & BITS_PER_PAGE_MASK;
> - map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
> - /*
> - * If last_pid points into the middle of the map->page we
> - * want to scan this bitmap block twice, the second time
> - * we start with offset == 0 (or RESERVED_PIDS).
> - */
> - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
> - for (i = 0; i <= max_scan; ++i) {
> - if (unlikely(!map->page)) {
> - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> - /*
> - * Free the page if someone raced with us
> - * installing it:
> - */
> - spin_lock_irq(&pidmap_lock);
> - if (!map->page) {
> - map->page = page;
> - page = NULL;
> - }
> - spin_unlock_irq(&pidmap_lock);
> - kfree(page);
> - if (unlikely(!map->page))
> - return -ENOMEM;
> - }
> - if (likely(atomic_read(&map->nr_free))) {
> - for ( ; ; ) {
> - if (!test_and_set_bit(offset, map->page)) {
> - atomic_dec(&map->nr_free);
> - set_last_pid(pid_ns, last, pid);
> - return pid;
> - }
> - offset = find_next_offset(map, offset);
> - if (offset >= BITS_PER_PAGE)
> - break;
> - pid = mk_pid(pid_ns, map, offset);
> - if (pid >= pid_max)
> - break;
> - }
> - }
> - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
> - ++map;
> - offset = 0;
> - } else {
> - map = &pid_ns->pidmap[0];
> - offset = RESERVED_PIDS;
> - if (unlikely(last == offset))
> - break;
> - }
> - pid = mk_pid(pid_ns, map, offset);
> - }
> - return -EAGAIN;
> -}
> -
> -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
> -{
> - int offset;
> - struct pidmap *map, *end;
> -
> - if (last >= PID_MAX_LIMIT)
> - return -1;
> -
> - offset = (last + 1) & BITS_PER_PAGE_MASK;
> - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
> - end = &pid_ns->pidmap[PIDMAP_ENTRIES];
> - for (; map < end; map++, offset = 0) {
> - if (unlikely(!map->page))
> - continue;
> - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
> - if (offset < BITS_PER_PAGE)
> - return mk_pid(pid_ns, map, offset);
> - }
> - return -1;
> -}
> -
> void put_pid(struct pid *pid)
> {
> struct pid_namespace *ns;
> @@ -266,7 +124,7 @@ void free_pid(struct pid *pid)
> struct upid *upid = pid->numbers + i;
> struct pid_namespace *ns = upid->ns;
> hlist_del_rcu(&upid->pid_chain);
> - switch(--ns->nr_hashed) {
> + switch (--ns->nr_hashed) {
> case 2:
> case 1:
> /* When all that is left in the pid namespace
> @@ -284,12 +142,11 @@ void free_pid(struct pid *pid)
> schedule_work(&ns->proc_work);
> break;
> }
> +
> + idr_remove(&ns->idr, upid->nr);
> }
> spin_unlock_irqrestore(&pidmap_lock, flags);
>
> - for (i = 0; i <= pid->level; i++)
> - free_pidmap(pid->numbers + i);
> -
> call_rcu(&pid->rcu, delayed_put_pid);
> }
>
> @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>
> tmp = ns;
> pid->level = ns->level;
> +
> for (i = ns->level; i >= 0; i--) {
> - nr = alloc_pidmap(tmp);
> + int pid_min = 1;
> +
> + idr_preload(GFP_KERNEL);
> + spin_lock_irq(&pidmap_lock);
> +
> + /*
> + * init really needs pid 1, but after reaching the maximum
> + * wrap back to RESERVED_PIDS
> + */
> + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> + pid_min = RESERVED_PIDS;
> +
> + /*
> + * Store a null pointer so find_pid_ns does not find
> + * a partially initialized PID (see below).
> + */
> + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> + pid_max, GFP_ATOMIC);
> + spin_unlock_irq(&pidmap_lock);
> + idr_preload_end();
> +
> if (nr < 0) {
> retval = nr;
> goto out_free;
> @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> for ( ; upid >= pid->numbers; --upid) {
> hlist_add_head_rcu(&upid->pid_chain,
> &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
> + /* Make the PID visible to find_pid_ns. */
> + idr_replace(&upid->ns->idr, pid, upid->nr);
> upid->ns->nr_hashed++;
> }
> spin_unlock_irq(&pidmap_lock);
> @@ -350,8 +230,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> put_pid_ns(ns);
>
> out_free:
> + spin_lock_irq(&pidmap_lock);
> while (++i <= ns->level)
> - free_pidmap(pid->numbers + i);
> + idr_remove(&ns->idr, (pid->numbers + i)->nr);
> +
> + spin_unlock_irq(&pidmap_lock);
>
> kmem_cache_free(ns->pid_cachep, pid);
> return ERR_PTR(retval);
> @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
> */
> struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
> {
> - struct pid *pid;
> -
> - do {
> - pid = find_pid_ns(nr, ns);
> - if (pid)
> - break;
> - nr = next_pidmap(ns, nr);
> - } while (nr > 0);
> -
> - return pid;
> + return idr_get_next(&ns->idr, &nr);
> }
>
> /*
> @@ -578,7 +452,7 @@ void __init pidhash_init(void)
> 0, 4096);
> }
>
> -void __init pidmap_init(void)
> +void __init pid_idr_init(void)
> {
> /* Verify no one has done anything silly: */
> BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
> @@ -590,10 +464,7 @@ void __init pidmap_init(void)
> PIDS_PER_CPU_MIN * num_possible_cpus());
> pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
>
> - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> - /* Reserve PID 0. We never call free_pidmap(0) */
> - set_bit(0, init_pid_ns.pidmap[0].page);
> - atomic_dec(&init_pid_ns.pidmap[0].nr_free);
> + idr_init(&init_pid_ns.idr);
>
> init_pid_ns.pid_cachep = KMEM_CACHE(pid,
> SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 4918314..5009dbe 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -21,6 +21,7 @@
> #include <linux/export.h>
> #include <linux/sched/task.h>
> #include <linux/sched/signal.h>
> +#include <linux/idr.h>
>
> struct pid_cache {
> int nr_ids;
> @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> struct pid_namespace *ns;
> unsigned int level = parent_pid_ns->level + 1;
> struct ucounts *ucounts;
> - int i;
> int err;
>
> err = -EINVAL;
> @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> if (ns == NULL)
> goto out_dec;
>
> - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> - if (!ns->pidmap[0].page)
> - goto out_free;
> + idr_init(&ns->idr);
>
> ns->pid_cachep = create_pid_cachep(level + 1);
> if (ns->pid_cachep == NULL)
> - goto out_free_map;
> + goto out_free_idr;
>
> err = ns_alloc_inum(&ns->ns);
> if (err)
> - goto out_free_map;
> + goto out_free_idr;
> ns->ns.ops = &pidns_operations;
>
> kref_init(&ns->kref);
> @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> ns->nr_hashed = PIDNS_HASH_ADDING;
> INIT_WORK(&ns->proc_work, proc_cleanup_work);
>
> - set_bit(0, ns->pidmap[0].page);
> - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
> -
> - for (i = 1; i < PIDMAP_ENTRIES; i++)
> - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
> -
> return ns;
>
> -out_free_map:
> - kfree(ns->pidmap[0].page);
> -out_free:
> +out_free_idr:
> + idr_destroy(&ns->idr);
> kmem_cache_free(pid_ns_cachep, ns);
> out_dec:
> dec_pid_namespaces(ucounts);
> @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
>
> static void destroy_pid_namespace(struct pid_namespace *ns)
> {
> - int i;
> -
> ns_free_inum(&ns->ns);
> - for (i = 0; i < PIDMAP_ENTRIES; i++)
> - kfree(ns->pidmap[i].page);
> +
> + idr_destroy(&ns->idr);
> call_rcu(&ns->rcu, delayed_free_pidns);
> }
>
> @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> int rc;
> struct task_struct *task, *me = current;
> int init_pids = thread_group_leader(me) ? 1 : 2;
> + struct pid *pid;
>
> /* Don't allow any more processes into the pid namespace */
> disable_pid_allocation(pid_ns);
> @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> * maintain a tasklist for each pid namespace.
> *
> */
> + rcu_read_lock();
> read_lock(&tasklist_lock);
> - nr = next_pidmap(pid_ns, 1);
> - while (nr > 0) {
> - rcu_read_lock();
> -
> - task = pid_task(find_vpid(nr), PIDTYPE_PID);
> + nr = 2;
> + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
> + task = pid_task(pid, PIDTYPE_PID);
> if (task && !__fatal_signal_pending(task))
> send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
> -
> - rcu_read_unlock();
> -
> - nr = next_pidmap(pid_ns, nr);
> }
> read_unlock(&tasklist_lock);
> + rcu_read_unlock();
>
> /*
> * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
> @@ -311,7 +297,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
> * it should synchronize its usage with external means.
> */
>
> - tmp.data = &pid_ns->last_pid;
> + tmp.data = &pid_ns->idr.idr_next;
> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> }
>