Re: [PATCH 01/23] tref: Implement task references.

From: Oleg Nesterov
Date: Sat Mar 04 2006 - 12:30:44 EST


Oleg Nesterov wrote:
>
> "Eric W. Biederman" wrote:
> >
> > Oleg Nesterov <oleg@xxxxxxxxxx> writes:
> >
> > > fastcall void free_pidmap(int pid)
> > > {
> > > 	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
> > > 	int offset = pid & BITS_PER_PAGE_MASK;
> > > 	struct pid_ref *ref;
> > >
> > > 	clear_bit(offset, map->page);
> > > 	atomic_inc(&map->nr_free);
> > >
> > > 	ref = find_pid_ref(pid);
> > > 	if (unlikely(ref != NULL)) {
> > > 		hlist_del_init(&ref->chain);
> > > 		ref->pid = 0;
> > > 	}
> > > }
> >
> > Ouch! I believe free_pidmap now needs the tasklist_lock so
> > we can free the pid and kill the pid_ref atomically. Otherwise
> > the pid could potentially get reused before we free the pid reference.
> > I think that means ensuring all of the callers take tasklist_lock.
>
> Yes, you are right. And do_fork() calls free_pidmap() locklessly in
> the error path. That path is not performance critical, so maybe it
> is OK to add write_lock(tasklist) here.

Even better: don't use tasklist_lock at all. We can use pidmap_lock
instead, see the patch below.

I have added a simple find_task_by_pid_ref() helper. Note that it needs
neither pidmap_lock nor a check for ref->pid != 0.

If the caller takes read_lock(tasklist), this helper can't return an
unhashed task_struct; without that lock it is possible anyway.

Oleg.

(for review only)

--- 2.6.16-rc5/include/linux/sched.h~1_REF 2006-03-01 22:00:30.000000000 +0300
+++ 2.6.16-rc5/include/linux/sched.h 2006-03-04 22:56:44.000000000 +0300
@@ -1012,8 +1012,6 @@ extern struct task_struct init_task;

extern struct mm_struct init_mm;

-#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr)
-extern struct task_struct *find_task_by_pid_type(int type, int pid);
extern void set_special_pids(pid_t session, pid_t pgrp);
extern void __set_special_pids(pid_t session, pid_t pgrp);

--- 2.6.16-rc5/include/linux/pid.h~1_REF 2006-03-01 22:00:29.000000000 +0300
+++ 2.6.16-rc5/include/linux/pid.h 2006-03-04 23:02:51.000000000 +0300
@@ -35,6 +35,9 @@ extern void FASTCALL(detach_pid(struct t
* held.
*/
extern struct pid *FASTCALL(find_pid(enum pid_type, int));
+extern struct task_struct *find_task_by_pid_type(int type, int pid);
+
+#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr)

extern int alloc_pidmap(void);
extern void FASTCALL(free_pidmap(int));
@@ -51,4 +54,23 @@ extern void FASTCALL(free_pidmap(int));
hlist_unhashed(&(task)->pids[type].pid_chain)); \
} \

+struct pid_ref
+{
+	pid_t pid;
+	int count;
+	struct hlist_node chain;
+};
+
+extern struct pid_ref *alloc_pid_ref(pid_t pid);
+extern void put_pid_ref(struct pid_ref *ref);
+
+static inline struct task_struct *find_task_by_pid_ref(struct pid_ref *ref,
+							enum pid_type type)
+{
+	if (!ref)
+		return NULL;
+
+	return find_task_by_pid_type(type, ref->pid);
+}
+
#endif /* _LINUX_PID_H */
--- 2.6.16-rc5/kernel/pid.c~1_REF 2006-03-01 22:03:25.000000000 +0300
+++ 2.6.16-rc5/kernel/pid.c 2006-03-04 22:27:51.000000000 +0300
@@ -28,9 +28,12 @@
#include <linux/hash.h>

#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
-static struct hlist_head *pid_hash[PIDTYPE_MAX];
+static struct hlist_head *pid_hash[PIDTYPE_MAX + 1];
static int pidhash_shift;

+#define ref_hashfn(pid) pid_hashfn(pid)
+#define ref_hash pid_hash[PIDTYPE_MAX]
+
int pid_max = PID_MAX_DEFAULT;
int last_pid;

@@ -62,13 +65,35 @@ static pidmap_t pidmap_array[PIDMAP_ENTR

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

+static struct pid_ref *find_pid_ref(pid_t pid)
+{
+	struct hlist_node *elem;
+	struct pid_ref *ref;
+
+	hlist_for_each_entry(ref, elem, &ref_hash[ref_hashfn(pid)], chain)
+		if (ref->pid == pid)
+			return ref;
+
+	return NULL;
+}
+
 fastcall void free_pidmap(int pid)
 {
 	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
 	int offset = pid & BITS_PER_PAGE_MASK;
+	struct pid_ref *ref;
+	unsigned long flags;
 
 	clear_bit(offset, map->page);
 	atomic_inc(&map->nr_free);
+
+	spin_lock_irqsave(&pidmap_lock, flags);
+	ref = find_pid_ref(pid);
+	if (unlikely(ref != NULL)) {
+		hlist_del_init(&ref->chain);
+		ref->pid = 0;
+	}
+	spin_unlock_irqrestore(&pidmap_lock, flags);
 }

int alloc_pidmap(void)
@@ -217,6 +242,48 @@ task_t *find_task_by_pid_type(int type,

EXPORT_SYMBOL(find_task_by_pid_type);

+static inline int pid_inuse(pid_t pid)
+{
+	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
+	int offset = pid & BITS_PER_PAGE_MASK;
+
+	return likely(map->page) && test_bit(offset, map->page);
+}
+
+struct pid_ref *alloc_pid_ref(pid_t pid)
+{
+	struct pid_ref *ref;
+
+	spin_lock_irq(&pidmap_lock);
+	ref = find_pid_ref(pid);
+	if (ref)
+		ref->count++;
+	else if (pid_inuse(pid)) {
+		ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+		if (ref) {
+			ref->pid = pid;
+			ref->count = 1;
+			hlist_add_head(&ref->chain,
+					&ref_hash[ref_hashfn(pid)]);
+		}
+	}
+	spin_unlock_irq(&pidmap_lock);
+
+	return ref;
+}
+
+void put_pid_ref(struct pid_ref *ref)
+{
+	if (!ref)
+		return;
+
+	spin_lock_irq(&pidmap_lock);
+	if (!--ref->count) {
+		hlist_del_init(&ref->chain);
+		kfree(ref);
+	}
+	spin_unlock_irq(&pidmap_lock);
+}
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -233,9 +300,9 @@ void __init pidhash_init(void)

printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
pidhash_size, pidhash_shift,
- PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
+ (PIDTYPE_MAX + 1) * pidhash_size * sizeof(struct hlist_head));

- for (i = 0; i < PIDTYPE_MAX; i++) {
+ for (i = 0; i < PIDTYPE_MAX + 1; i++) {
pid_hash[i] = alloc_bootmem(pidhash_size *
sizeof(*(pid_hash[i])));
if (!pid_hash[i])