Re: [v6,1/2] pid: Replace pid bitmap implementation with IDR API

From: Gargi Sharma
Date: Thu Oct 19 2017 - 09:21:06 EST


On Thu, Oct 19, 2017 at 8:30 AM, Andrei Vagin <avagin@xxxxxxxxxxxxx> wrote:
> Hi Gargi,
>
> This patch breaks CRIU, because it changes a meaning of ns_last_pid.
>
> ========================== Run zdtm/static/env00 in h ==========================
> DEP env00.d
> CC env00.o
> LINK env00
> Start test
> ./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST
> Run criu dump
> Run criu restore
> =[log]=> dump/zdtm/static/env00/52/1/restore.log
> ------------------------ grep Error ------------------------
> (00.000587) No mountpoints-6.img image
> (00.000593) mnt: Reading mountpoint images (id 6 pid 52)
> (00.000653) Forking task with 52 pid (flags 0x0)
> (00.007568) PID: real 51 virt 52
> (00.010363) 52: Error (criu/cr-restore.c:1787): Pid 51 do not match expected 52 (task 52)
> (00.010474) Error (criu/cr-restore.c:2449): Restoring FAILED.
> ------------------------ ERROR OVER ------------------------
>
> Before this patch, ns_last_pid contains a pid of a last process. With
> this patch, it contains a pid of a "next" process.
>
> In CRIU we use ns_last_pid to restore a process with a specified pid,
> and now this logic is broken:
Hi Andrei,

We looked at the uses of last_pid and did not find critical uses
inside the kernel, hence figured it'll be okay to change it to store
the next pid. But that was not true, I will fix this in the next
version.

Thanks,
Gargi

>
> $ uname -a
> Linux laptop 4.11.11-200.fc25.x86_64 #1 SMP Mon Jul 17 17:41:12 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
> $ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
> 20000
>
> $ uname -a
> Linux fc24 4.14.0-rc5-next-20171018 #1 SMP Wed Oct 18 23:52:43 PDT 2017 x86_64 x86_64 x86_64 GNU/Linux
> $ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
> 19999
>
> Thanks,
> Andrei
>
> On Wed, Oct 11, 2017 at 06:19:38PM -0400, Gargi Sharma wrote:
>> This patch replaces the current bitmap implemetation for
>> Process ID allocation. Functions that are no longer required,
>> for example, free_pidmap(), alloc_pidmap(), etc. are removed.
>> The rest of the functions are modified to use the IDR API.
>> The change was made to make the PID allocation less complex by
>> replacing custom code with calls to generic API.
>>
>> Signed-off-by: Gargi Sharma <gs051095@xxxxxxxxx>
>> Reviewed-by: Rik van Riel <riel@xxxxxxxxxx>
>> ---
>> arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
>> fs/proc/loadavg.c | 2 +-
>> include/linux/pid_namespace.h | 14 +--
>> init/main.c | 2 +-
>> kernel/pid.c | 201 ++++++------------------------
>> kernel/pid_namespace.c | 44 +++----
>> 6 files changed, 57 insertions(+), 208 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
>> index 1fbb5da..e47761c 100644
>> --- a/arch/powerpc/platforms/cell/spufs/sched.c
>> +++ b/arch/powerpc/platforms/cell/spufs/sched.c
>> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
>> LOAD_INT(c), LOAD_FRAC(c),
>> count_active_contexts(),
>> atomic_read(&nr_spu_contexts),
>> - task_active_pid_ns(current)->last_pid);
>> + idr_get_cursor(&task_active_pid_ns(current)->idr));
>> return 0;
>> }
>>
>> diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
>> index 983fce5..ba3d0e2 100644
>> --- a/fs/proc/loadavg.c
>> +++ b/fs/proc/loadavg.c
>> @@ -23,7 +23,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
>> LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
>> LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
>> nr_running(), nr_threads,
>> - task_active_pid_ns(current)->last_pid);
>> + idr_get_cursor(&task_active_pid_ns(current)->idr));
>> return 0;
>> }
>>
>> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
>> index b09136f..f4db4a7 100644
>> --- a/include/linux/pid_namespace.h
>> +++ b/include/linux/pid_namespace.h
>> @@ -9,15 +9,8 @@
>> #include <linux/nsproxy.h>
>> #include <linux/kref.h>
>> #include <linux/ns_common.h>
>> +#include <linux/idr.h>
>>
>> -struct pidmap {
>> - atomic_t nr_free;
>> - void *page;
>> -};
>> -
>> -#define BITS_PER_PAGE (PAGE_SIZE * 8)
>> -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
>> -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
>>
>> struct fs_pin;
>>
>> @@ -29,9 +22,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
>>
>> struct pid_namespace {
>> struct kref kref;
>> - struct pidmap pidmap[PIDMAP_ENTRIES];
>> + struct idr idr;
>> struct rcu_head rcu;
>> - int last_pid;
>> unsigned int nr_hashed;
>> struct task_struct *child_reaper;
>> struct kmem_cache *pid_cachep;
>> @@ -105,6 +97,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>>
>> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>> void pidhash_init(void);
>> -void pidmap_init(void);
>> +void pid_idr_init(void);
>>
>> #endif /* _LINUX_PID_NS_H */
>> diff --git a/init/main.c b/init/main.c
>> index 0ee9c686..9f4db20 100644
>> --- a/init/main.c
>> +++ b/init/main.c
>> @@ -667,7 +667,7 @@ asmlinkage __visible void __init start_kernel(void)
>> if (late_time_init)
>> late_time_init();
>> calibrate_delay();
>> - pidmap_init();
>> + pid_idr_init();
>> anon_vma_init();
>> acpi_early_init();
>> #ifdef CONFIG_X86
>> diff --git a/kernel/pid.c b/kernel/pid.c
>> index 020dedb..0ce5936 100644
>> --- a/kernel/pid.c
>> +++ b/kernel/pid.c
>> @@ -39,6 +39,7 @@
>> #include <linux/proc_ns.h>
>> #include <linux/proc_fs.h>
>> #include <linux/sched/task.h>
>> +#include <linux/idr.h>
>>
>> #define pid_hashfn(nr, ns) \
>> hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
>> @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
>> int pid_max_min = RESERVED_PIDS + 1;
>> int pid_max_max = PID_MAX_LIMIT;
>>
>> -static inline int mk_pid(struct pid_namespace *pid_ns,
>> - struct pidmap *map, int off)
>> -{
>> - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
>> -}
>> -
>> -#define find_next_offset(map, off) \
>> - find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
>>
>> /*
>> * PID-map pages start out as NULL, they get allocated upon
>> @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
>> */
>> struct pid_namespace init_pid_ns = {
>> .kref = KREF_INIT(2),
>> - .pidmap = {
>> - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
>> - },
>> - .last_pid = 0,
>> + .idr = IDR_INIT,
>> .nr_hashed = PIDNS_HASH_ADDING,
>> .level = 0,
>> .child_reaper = &init_task,
>> @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
>>
>> static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
>>
>> -static void free_pidmap(struct upid *upid)
>> -{
>> - int nr = upid->nr;
>> - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
>> - int offset = nr & BITS_PER_PAGE_MASK;
>> -
>> - clear_bit(offset, map->page);
>> - atomic_inc(&map->nr_free);
>> -}
>> -
>> -/*
>> - * If we started walking pids at 'base', is 'a' seen before 'b'?
>> - */
>> -static int pid_before(int base, int a, int b)
>> -{
>> - /*
>> - * This is the same as saying
>> - *
>> - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
>> - * and that mapping orders 'a' and 'b' with respect to 'base'.
>> - */
>> - return (unsigned)(a - base) < (unsigned)(b - base);
>> -}
>> -
>> -/*
>> - * We might be racing with someone else trying to set pid_ns->last_pid
>> - * at the pid allocation time (there's also a sysctl for this, but racing
>> - * with this one is OK, see comment in kernel/pid_namespace.c about it).
>> - * We want the winner to have the "later" value, because if the
>> - * "earlier" value prevails, then a pid may get reused immediately.
>> - *
>> - * Since pids rollover, it is not sufficient to just pick the bigger
>> - * value. We have to consider where we started counting from.
>> - *
>> - * 'base' is the value of pid_ns->last_pid that we observed when
>> - * we started looking for a pid.
>> - *
>> - * 'pid' is the pid that we eventually found.
>> - */
>> -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
>> -{
>> - int prev;
>> - int last_write = base;
>> - do {
>> - prev = last_write;
>> - last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
>> - } while ((prev != last_write) && (pid_before(base, last_write, pid)));
>> -}
>> -
>> -static int alloc_pidmap(struct pid_namespace *pid_ns)
>> -{
>> - int i, offset, max_scan, pid, last = pid_ns->last_pid;
>> - struct pidmap *map;
>> -
>> - pid = last + 1;
>> - if (pid >= pid_max)
>> - pid = RESERVED_PIDS;
>> - offset = pid & BITS_PER_PAGE_MASK;
>> - map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
>> - /*
>> - * If last_pid points into the middle of the map->page we
>> - * want to scan this bitmap block twice, the second time
>> - * we start with offset == 0 (or RESERVED_PIDS).
>> - */
>> - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
>> - for (i = 0; i <= max_scan; ++i) {
>> - if (unlikely(!map->page)) {
>> - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
>> - /*
>> - * Free the page if someone raced with us
>> - * installing it:
>> - */
>> - spin_lock_irq(&pidmap_lock);
>> - if (!map->page) {
>> - map->page = page;
>> - page = NULL;
>> - }
>> - spin_unlock_irq(&pidmap_lock);
>> - kfree(page);
>> - if (unlikely(!map->page))
>> - return -ENOMEM;
>> - }
>> - if (likely(atomic_read(&map->nr_free))) {
>> - for ( ; ; ) {
>> - if (!test_and_set_bit(offset, map->page)) {
>> - atomic_dec(&map->nr_free);
>> - set_last_pid(pid_ns, last, pid);
>> - return pid;
>> - }
>> - offset = find_next_offset(map, offset);
>> - if (offset >= BITS_PER_PAGE)
>> - break;
>> - pid = mk_pid(pid_ns, map, offset);
>> - if (pid >= pid_max)
>> - break;
>> - }
>> - }
>> - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
>> - ++map;
>> - offset = 0;
>> - } else {
>> - map = &pid_ns->pidmap[0];
>> - offset = RESERVED_PIDS;
>> - if (unlikely(last == offset))
>> - break;
>> - }
>> - pid = mk_pid(pid_ns, map, offset);
>> - }
>> - return -EAGAIN;
>> -}
>> -
>> -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
>> -{
>> - int offset;
>> - struct pidmap *map, *end;
>> -
>> - if (last >= PID_MAX_LIMIT)
>> - return -1;
>> -
>> - offset = (last + 1) & BITS_PER_PAGE_MASK;
>> - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
>> - end = &pid_ns->pidmap[PIDMAP_ENTRIES];
>> - for (; map < end; map++, offset = 0) {
>> - if (unlikely(!map->page))
>> - continue;
>> - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
>> - if (offset < BITS_PER_PAGE)
>> - return mk_pid(pid_ns, map, offset);
>> - }
>> - return -1;
>> -}
>> -
>> void put_pid(struct pid *pid)
>> {
>> struct pid_namespace *ns;
>> @@ -266,7 +124,7 @@ void free_pid(struct pid *pid)
>> struct upid *upid = pid->numbers + i;
>> struct pid_namespace *ns = upid->ns;
>> hlist_del_rcu(&upid->pid_chain);
>> - switch(--ns->nr_hashed) {
>> + switch (--ns->nr_hashed) {
>> case 2:
>> case 1:
>> /* When all that is left in the pid namespace
>> @@ -284,12 +142,11 @@ void free_pid(struct pid *pid)
>> schedule_work(&ns->proc_work);
>> break;
>> }
>> +
>> + idr_remove(&ns->idr, upid->nr);
>> }
>> spin_unlock_irqrestore(&pidmap_lock, flags);
>>
>> - for (i = 0; i <= pid->level; i++)
>> - free_pidmap(pid->numbers + i);
>> -
>> call_rcu(&pid->rcu, delayed_put_pid);
>> }
>>
>> @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>>
>> tmp = ns;
>> pid->level = ns->level;
>> +
>> for (i = ns->level; i >= 0; i--) {
>> - nr = alloc_pidmap(tmp);
>> + int pid_min = 1;
>> +
>> + idr_preload(GFP_KERNEL);
>> + spin_lock_irq(&pidmap_lock);
>> +
>> + /*
>> + * init really needs pid 1, but after reaching the maximum
>> + * wrap back to RESERVED_PIDS
>> + */
>> + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
>> + pid_min = RESERVED_PIDS;
>> +
>> + /*
>> + * Store a null pointer so find_pid_ns does not find
>> + * a partially initialized PID (see below).
>> + */
>> + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
>> + pid_max, GFP_ATOMIC);
>> + spin_unlock_irq(&pidmap_lock);
>> + idr_preload_end();
>> +
>> if (nr < 0) {
>> retval = nr;
>> goto out_free;
>> @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>> for ( ; upid >= pid->numbers; --upid) {
>> hlist_add_head_rcu(&upid->pid_chain,
>> &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
>> + /* Make the PID visible to find_pid_ns. */
>> + idr_replace(&upid->ns->idr, pid, upid->nr);
>> upid->ns->nr_hashed++;
>> }
>> spin_unlock_irq(&pidmap_lock);
>> @@ -350,8 +230,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>> put_pid_ns(ns);
>>
>> out_free:
>> + spin_lock_irq(&pidmap_lock);
>> while (++i <= ns->level)
>> - free_pidmap(pid->numbers + i);
>> + idr_remove(&ns->idr, (pid->numbers + i)->nr);
>> +
>> + spin_unlock_irq(&pidmap_lock);
>>
>> kmem_cache_free(ns->pid_cachep, pid);
>> return ERR_PTR(retval);
>> @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
>> */
>> struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
>> {
>> - struct pid *pid;
>> -
>> - do {
>> - pid = find_pid_ns(nr, ns);
>> - if (pid)
>> - break;
>> - nr = next_pidmap(ns, nr);
>> - } while (nr > 0);
>> -
>> - return pid;
>> + return idr_get_next(&ns->idr, &nr);
>> }
>>
>> /*
>> @@ -578,7 +452,7 @@ void __init pidhash_init(void)
>> 0, 4096);
>> }
>>
>> -void __init pidmap_init(void)
>> +void __init pid_idr_init(void)
>> {
>> /* Verify no one has done anything silly: */
>> BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
>> @@ -590,10 +464,7 @@ void __init pidmap_init(void)
>> PIDS_PER_CPU_MIN * num_possible_cpus());
>> pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
>>
>> - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
>> - /* Reserve PID 0. We never call free_pidmap(0) */
>> - set_bit(0, init_pid_ns.pidmap[0].page);
>> - atomic_dec(&init_pid_ns.pidmap[0].nr_free);
>> + idr_init(&init_pid_ns.idr);
>>
>> init_pid_ns.pid_cachep = KMEM_CACHE(pid,
>> SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>> index 4918314..5009dbe 100644
>> --- a/kernel/pid_namespace.c
>> +++ b/kernel/pid_namespace.c
>> @@ -21,6 +21,7 @@
>> #include <linux/export.h>
>> #include <linux/sched/task.h>
>> #include <linux/sched/signal.h>
>> +#include <linux/idr.h>
>>
>> struct pid_cache {
>> int nr_ids;
>> @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>> struct pid_namespace *ns;
>> unsigned int level = parent_pid_ns->level + 1;
>> struct ucounts *ucounts;
>> - int i;
>> int err;
>>
>> err = -EINVAL;
>> @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>> if (ns == NULL)
>> goto out_dec;
>>
>> - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
>> - if (!ns->pidmap[0].page)
>> - goto out_free;
>> + idr_init(&ns->idr);
>>
>> ns->pid_cachep = create_pid_cachep(level + 1);
>> if (ns->pid_cachep == NULL)
>> - goto out_free_map;
>> + goto out_free_idr;
>>
>> err = ns_alloc_inum(&ns->ns);
>> if (err)
>> - goto out_free_map;
>> + goto out_free_idr;
>> ns->ns.ops = &pidns_operations;
>>
>> kref_init(&ns->kref);
>> @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>> ns->nr_hashed = PIDNS_HASH_ADDING;
>> INIT_WORK(&ns->proc_work, proc_cleanup_work);
>>
>> - set_bit(0, ns->pidmap[0].page);
>> - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
>> -
>> - for (i = 1; i < PIDMAP_ENTRIES; i++)
>> - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
>> -
>> return ns;
>>
>> -out_free_map:
>> - kfree(ns->pidmap[0].page);
>> -out_free:
>> +out_free_idr:
>> + idr_destroy(&ns->idr);
>> kmem_cache_free(pid_ns_cachep, ns);
>> out_dec:
>> dec_pid_namespaces(ucounts);
>> @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
>>
>> static void destroy_pid_namespace(struct pid_namespace *ns)
>> {
>> - int i;
>> -
>> ns_free_inum(&ns->ns);
>> - for (i = 0; i < PIDMAP_ENTRIES; i++)
>> - kfree(ns->pidmap[i].page);
>> +
>> + idr_destroy(&ns->idr);
>> call_rcu(&ns->rcu, delayed_free_pidns);
>> }
>>
>> @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>> int rc;
>> struct task_struct *task, *me = current;
>> int init_pids = thread_group_leader(me) ? 1 : 2;
>> + struct pid *pid;
>>
>> /* Don't allow any more processes into the pid namespace */
>> disable_pid_allocation(pid_ns);
>> @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>> * maintain a tasklist for each pid namespace.
>> *
>> */
>> + rcu_read_lock();
>> read_lock(&tasklist_lock);
>> - nr = next_pidmap(pid_ns, 1);
>> - while (nr > 0) {
>> - rcu_read_lock();
>> -
>> - task = pid_task(find_vpid(nr), PIDTYPE_PID);
>> + nr = 2;
>> + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
>> + task = pid_task(pid, PIDTYPE_PID);
>> if (task && !__fatal_signal_pending(task))
>> send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
>> -
>> - rcu_read_unlock();
>> -
>> - nr = next_pidmap(pid_ns, nr);
>> }
>> read_unlock(&tasklist_lock);
>> + rcu_read_unlock();
>>
>> /*
>> * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
>> @@ -311,7 +297,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>> * it should synchronize its usage with external means.
>> */
>>
>> - tmp.data = &pid_ns->last_pid;
>> + tmp.data = &pid_ns->idr.idr_next;
>> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
>> }
>>