[PATCH] wait4() WIFSTOPPED starvation fix #1/2

From: David Howells (dhowells@redhat.com)
Date: Fri Mar 15 2002 - 16:09:20 EST


Hi Linus,

There appears to be a starvation bug in sys_wait4().

The case runs like this:

 (1) If a process has a lot of threads, and a debugger like strace is running
     over all the threads, then the threads will be sat in the stopped more
     often than they're running.

 (2) the children being traced are "reparented" and added to strace's children
     list.

 (4) strace uses wait4() to find an _unspecified_ stopped (T) or zombie (Z)
     state process for it to deal with.

 (5) sys_wait4() always walks the list of children in the same order
     (effectively fork/clone order).

 (6) children at the end of the list starve for attention because sys_wait4()
     always takes the first process on the list that needs attention, and the
     list isn't ever rearranged.

This patch (#1) just converts the task_struct to use struct list_head rather
than direct pointers for maintaining the children list.

David

diff -uNr linux-2.5.6-pre1/arch/i386/kernel/signal.c linux-wait-256p1/arch/i386/kernel/signal.c
--- linux-2.5.6-pre1/arch/i386/kernel/signal.c Wed Feb 27 08:27:56 2002
+++ linux-wait-256p1/arch/i386/kernel/signal.c Fri Mar 15 19:10:03 2002
@@ -630,8 +630,8 @@
                                 info.si_signo = signr;
                                 info.si_errno = 0;
                                 info.si_code = SI_USER;
- info.si_pid = current->p_pptr->pid;
- info.si_uid = current->p_pptr->uid;
+ info.si_pid = current->parent->pid;
+ info.si_uid = current->parent->uid;
                         }
 
                         /* If the (new) signal is now blocked, requeue it. */
@@ -670,7 +670,7 @@
                         case SIGSTOP: {
                                 struct signal_struct *sig;
                                 current->exit_code = signr;
- sig = current->p_pptr->sig;
+ sig = current->parent->sig;
                                 preempt_disable();
                                 current->state = TASK_STOPPED;
                                 if (sig && !(sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
diff -uNr linux-2.5.6-pre1/fs/binfmt_elf.c linux-wait-256p1/fs/binfmt_elf.c
--- linux-2.5.6-pre1/fs/binfmt_elf.c Wed Feb 27 08:26:56 2002
+++ linux-wait-256p1/fs/binfmt_elf.c Fri Mar 15 19:11:37 2002
@@ -1094,7 +1094,7 @@
         prstatus.pr_sigpend = current->pending.signal.sig[0];
         prstatus.pr_sighold = current->blocked.sig[0];
         psinfo.pr_pid = prstatus.pr_pid = current->pid;
- psinfo.pr_ppid = prstatus.pr_ppid = current->p_pptr->pid;
+ psinfo.pr_ppid = prstatus.pr_ppid = current->parent->pid;
         psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp;
         psinfo.pr_sid = prstatus.pr_sid = current->session;
         prstatus.pr_utime.tv_sec = CT_TO_SECS(current->times.tms_utime);
diff -uNr linux-2.5.6-pre1/fs/proc/array.c linux-wait-256p1/fs/proc/array.c
--- linux-2.5.6-pre1/fs/proc/array.c Wed Feb 27 08:26:55 2002
+++ linux-wait-256p1/fs/proc/array.c Fri Mar 15 19:09:08 2002
@@ -159,7 +159,7 @@
                 "Uid:\t%d\t%d\t%d\t%d\n"
                 "Gid:\t%d\t%d\t%d\t%d\n",
                 get_task_state(p), p->tgid,
- p->pid, p->pid ? p->p_opptr->pid : 0, 0,
+ p->pid, p->pid ? p->real_parent->pid : 0, 0,
                 p->uid, p->euid, p->suid, p->fsuid,
                 p->gid, p->egid, p->sgid, p->fsgid);
         read_unlock(&tasklist_lock);
@@ -340,7 +340,7 @@
         nice = task_nice(task);
 
         read_lock(&tasklist_lock);
- ppid = task->pid ? task->p_opptr->pid : 0;
+ ppid = task->pid ? task->real_parent->pid : 0;
         read_unlock(&tasklist_lock);
         res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
diff -uNr linux-2.5.6-pre1/fs/proc/base.c linux-wait-256p1/fs/proc/base.c
--- linux-2.5.6-pre1/fs/proc/base.c Wed Feb 27 08:26:55 2002
+++ linux-wait-256p1/fs/proc/base.c Fri Mar 15 19:09:19 2002
@@ -336,7 +336,7 @@
 };
 
 #define MAY_PTRACE(p) \
-(p==current||(p->p_pptr==current&&(p->ptrace & PT_PTRACED)&&p->state==TASK_STOPPED))
+(p==current||(p->parent==current&&(p->ptrace & PT_PTRACED)&&p->state==TASK_STOPPED))
 
 
 static int mem_open(struct inode* inode, struct file* file)
diff -uNr linux-2.5.6-pre1/include/linux/init_task.h linux-wait-256p1/include/linux/init_task.h
--- linux-2.5.6-pre1/include/linux/init_task.h Wed Feb 27 08:27:00 2002
+++ linux-wait-256p1/include/linux/init_task.h Fri Mar 15 18:58:27 2002
@@ -55,8 +55,10 @@
     time_slice: HZ, \
     next_task: &tsk, \
     prev_task: &tsk, \
- p_opptr: &tsk, \
- p_pptr: &tsk, \
+ real_parent: &tsk, \
+ parent: &tsk, \
+ children: LIST_HEAD_INIT(tsk.children), \
+ sibling: LIST_HEAD_INIT(tsk.sibling), \
     thread_group: LIST_HEAD_INIT(tsk.thread_group), \
     wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
     real_timer: { \
diff -uNr linux-2.5.6-pre1/include/linux/sched.h linux-wait-256p1/include/linux/sched.h
--- linux-2.5.6-pre1/include/linux/sched.h Wed Feb 27 08:29:20 2002
+++ linux-wait-256p1/include/linux/sched.h Fri Mar 15 18:52:29 2002
@@ -274,9 +274,12 @@
         /*
          * pointers to (original) parent process, youngest child, younger sibling,
          * older sibling, respectively. (p->father can be replaced with
- * p->p_pptr->pid)
+ * p->parent->pid)
          */
- struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+ struct task_struct *real_parent; /* real parent process (when being debugged) */
+ struct task_struct *parent; /* parent process */
+ struct list_head children; /* list of my children */
+ struct list_head sibling; /* linkage in my parent's children list */
         struct list_head thread_group;
 
         /* PID hash table linkage. */
@@ -715,28 +718,44 @@
         __ret; \
 })
 
-#define REMOVE_LINKS(p) do { \
- (p)->next_task->prev_task = (p)->prev_task; \
- (p)->prev_task->next_task = (p)->next_task; \
- if ((p)->p_osptr) \
- (p)->p_osptr->p_ysptr = (p)->p_ysptr; \
- if ((p)->p_ysptr) \
- (p)->p_ysptr->p_osptr = (p)->p_osptr; \
- else \
- (p)->p_pptr->p_cptr = (p)->p_osptr; \
+#define REMOVE_LINKS(p) do { \
+ (p)->next_task->prev_task = (p)->prev_task; \
+ (p)->prev_task->next_task = (p)->next_task; \
+ list_del_init(&(p)->sibling); \
         } while (0)
 
-#define SET_LINKS(p) do { \
- (p)->next_task = &init_task; \
- (p)->prev_task = init_task.prev_task; \
- init_task.prev_task->next_task = (p); \
- init_task.prev_task = (p); \
- (p)->p_ysptr = NULL; \
- if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
- (p)->p_osptr->p_ysptr = p; \
- (p)->p_pptr->p_cptr = p; \
+#define SET_LINKS(p) do { \
+ (p)->next_task = &init_task; \
+ (p)->prev_task = init_task.prev_task; \
+ init_task.prev_task->next_task = (p); \
+ init_task.prev_task = (p); \
+ list_add_tail(&(p)->sibling,&(p)->parent->children); \
         } while (0)
 
+static inline struct task_struct *eldest_child(struct task_struct *p)
+{
+ if (list_empty(&p->children)) return NULL;
+ return list_entry(p->children.next,struct task_struct,sibling);
+}
+
+static inline struct task_struct *youngest_child(struct task_struct *p)
+{
+ if (list_empty(&p->children)) return NULL;
+ return list_entry(p->children.prev,struct task_struct,sibling);
+}
+
+static inline struct task_struct *older_sibling(struct task_struct *p)
+{
+ if (p->sibling.prev==&p->parent->children) return NULL;
+ return list_entry(p->sibling.prev,struct task_struct,sibling);
+}
+
+static inline struct task_struct *younger_sibling(struct task_struct *p)
+{
+ if (p->sibling.next==&p->parent->children) return NULL;
+ return list_entry(p->sibling.next,struct task_struct,sibling);
+}
+
 #define for_each_task(p) \
         for (p = &init_task ; (p = p->next_task) != &init_task ; )
 
diff -uNr linux-2.5.6-pre1/kernel/exit.c linux-wait-256p1/kernel/exit.c
--- linux-2.5.6-pre1/kernel/exit.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/exit.c Fri Mar 15 19:05:47 2002
@@ -91,10 +91,10 @@
         for_each_task(p) {
                 if ((p == ignored_task) || (p->pgrp != pgrp) ||
                     (p->state == TASK_ZOMBIE) ||
- (p->p_pptr->pid == 1))
+ (p->parent->pid == 1))
                         continue;
- if ((p->p_pptr->pgrp != pgrp) &&
- (p->p_pptr->session == p->session)) {
+ if ((p->parent->pgrp != pgrp) &&
+ (p->parent->session == p->session)) {
                         read_unlock(&tasklist_lock);
                          return 0;
                 }
@@ -144,8 +144,8 @@
 
         /* Reparent to init */
         REMOVE_LINKS(current);
- current->p_pptr = child_reaper;
- current->p_opptr = child_reaper;
+ current->parent = child_reaper;
+ current->real_parent = child_reaper;
         SET_LINKS(current);
 
         /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -217,16 +217,16 @@
                 reaper = child_reaper;
 
         for_each_task(p) {
- if (p->p_opptr == father) {
+ if (p->real_parent == father) {
                         /* We dont want people slaying init */
                         p->exit_signal = SIGCHLD;
                         p->self_exec_id++;
 
                         /* Make sure we're not reparenting to ourselves */
                         if (p == reaper)
- p->p_opptr = child_reaper;
+ p->real_parent = child_reaper;
                         else
- p->p_opptr = reaper;
+ p->real_parent = reaper;
 
                         if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0);
                 }
@@ -400,7 +400,7 @@
          * is about to become orphaned.
          */
          
- t = current->p_pptr;
+ t = current->parent;
         
         if ((t->pgrp != current->pgrp) &&
             (t->session == current->session) &&
@@ -445,17 +445,12 @@
         write_lock_irq(&tasklist_lock);
         current->state = TASK_ZOMBIE;
         do_notify_parent(current, current->exit_signal);
- while (current->p_cptr != NULL) {
- p = current->p_cptr;
- current->p_cptr = p->p_osptr;
- p->p_ysptr = NULL;
+ while ((p = eldest_child(current))) {
+ list_del_init(&p->sibling);
                 p->ptrace = 0;
 
- p->p_pptr = p->p_opptr;
- p->p_osptr = p->p_pptr->p_cptr;
- if (p->p_osptr)
- p->p_osptr->p_ysptr = p;
- p->p_pptr->p_cptr = p;
+ p->parent = p->real_parent;
+ list_add_tail(&p->sibling,&p->parent->children);
                 if (p->state == TASK_ZOMBIE)
                         do_notify_parent(p, p->exit_signal);
                 /*
@@ -568,7 +563,9 @@
         tsk = current;
         do {
                 struct task_struct *p;
- for (p = tsk->p_cptr ; p ; p = p->p_osptr) {
+ struct list_head *_p;
+ list_for_each(_p,&tsk->children) {
+ p = list_entry(_p,struct task_struct,sibling);
                         if (pid>0) {
                                 if (p->pid != pid)
                                         continue;
@@ -613,10 +610,10 @@
                                 if (retval)
                                         goto end_wait4;
                                 retval = p->pid;
- if (p->p_opptr != p->p_pptr) {
+ if (p->real_parent != p->parent) {
                                         write_lock_irq(&tasklist_lock);
                                         REMOVE_LINKS(p);
- p->p_pptr = p->p_opptr;
+ p->parent = p->real_parent;
                                         SET_LINKS(p);
                                         do_notify_parent(p, SIGCHLD);
                                         write_unlock_irq(&tasklist_lock);
diff -uNr linux-2.5.6-pre1/kernel/fork.c linux-wait-256p1/kernel/fork.c
--- linux-2.5.6-pre1/kernel/fork.c Wed Feb 27 08:29:20 2002
+++ linux-wait-256p1/kernel/fork.c Fri Mar 15 18:53:10 2002
@@ -666,7 +666,8 @@
 
         INIT_LIST_HEAD(&p->run_list);
 
- p->p_cptr = NULL;
+ INIT_LIST_HEAD(&p->children);
+ INIT_LIST_HEAD(&p->sibling);
         init_waitqueue_head(&p->wait_chldexit);
         p->vfork_done = NULL;
         if (clone_flags & CLONE_VFORK) {
@@ -766,12 +767,12 @@
         write_lock_irq(&tasklist_lock);
 
         /* CLONE_PARENT re-uses the old parent */
- p->p_opptr = current->p_opptr;
- p->p_pptr = current->p_pptr;
+ p->real_parent = current->real_parent;
+ p->parent = current->parent;
         if (!(clone_flags & CLONE_PARENT)) {
- p->p_opptr = current;
+ p->real_parent = current;
                 if (!(p->ptrace & PT_PTRACED))
- p->p_pptr = current;
+ p->parent = current;
         }
 
         if (clone_flags & CLONE_THREAD) {
diff -uNr linux-2.5.6-pre1/kernel/ptrace.c linux-wait-256p1/kernel/ptrace.c
--- linux-2.5.6-pre1/kernel/ptrace.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/ptrace.c Fri Mar 15 19:06:07 2002
@@ -24,7 +24,7 @@
         if (!(child->ptrace & PT_PTRACED))
                 return -ESRCH;
 
- if (child->p_pptr != current)
+ if (child->parent != current)
                 return -ESRCH;
 
         if (!kill) {
@@ -70,9 +70,9 @@
         task_unlock(task);
 
         write_lock_irq(&tasklist_lock);
- if (task->p_pptr != current) {
+ if (task->parent != current) {
                 REMOVE_LINKS(task);
- task->p_pptr = current;
+ task->parent = current;
                 SET_LINKS(task);
         }
         write_unlock_irq(&tasklist_lock);
@@ -98,7 +98,7 @@
         child->exit_code = data;
         write_lock_irq(&tasklist_lock);
         REMOVE_LINKS(child);
- child->p_pptr = child->p_opptr;
+ child->parent = child->real_parent;
         SET_LINKS(child);
         write_unlock_irq(&tasklist_lock);
 
diff -uNr linux-2.5.6-pre1/kernel/sched.c linux-wait-256p1/kernel/sched.c
--- linux-2.5.6-pre1/kernel/sched.c Wed Feb 27 08:29:20 2002
+++ linux-wait-256p1/kernel/sched.c Fri Mar 15 18:40:27 2002
@@ -1311,6 +1311,7 @@
 static void show_task(task_t * p)
 {
         unsigned long free = 0;
+ task_t *relative;
         int state;
         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
 
@@ -1337,17 +1338,17 @@
                         n++;
                 free = (unsigned long) n - (unsigned long)(p+1);
         }
- printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
- if (p->p_cptr)
- printk("%5d ", p->p_cptr->pid);
+ printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
+ if ((relative = eldest_child(p)))
+ printk("%5d ", relative->pid);
         else
                 printk(" ");
- if (p->p_ysptr)
- printk("%7d", p->p_ysptr->pid);
+ if ((relative = younger_sibling(p)))
+ printk("%7d", relative->pid);
         else
                 printk(" ");
- if (p->p_osptr)
- printk(" %5d", p->p_osptr->pid);
+ if ((relative = older_sibling(p)))
+ printk(" %5d", relative->pid);
         else
                 printk(" ");
         if (!p->mm)
diff -uNr linux-2.5.6-pre1/kernel/signal.c linux-wait-256p1/kernel/signal.c
--- linux-2.5.6-pre1/kernel/signal.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/signal.c Fri Mar 15 19:06:50 2002
@@ -804,8 +804,8 @@
         info.si_code = why;
         info.si_status = status;
 
- send_sig_info(sig, &info, tsk->p_pptr);
- wake_up_parent(tsk->p_pptr);
+ send_sig_info(sig, &info, tsk->parent);
+ wake_up_parent(tsk->parent);
 }
 
 
diff -uNr linux-2.5.6-pre1/kernel/sys.c linux-wait-256p1/kernel/sys.c
--- linux-2.5.6-pre1/kernel/sys.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/sys.c Fri Mar 15 19:07:03 2002
@@ -839,7 +839,7 @@
         if (!p)
                 goto out;
 
- if (p->p_pptr == current || p->p_opptr == current) {
+ if (p->parent == current || p->real_parent == current) {
                 err = -EPERM;
                 if (p->session != current->session)
                         goto out;
diff -uNr linux-2.5.6-pre1/kernel/timer.c linux-wait-256p1/kernel/timer.c
--- linux-2.5.6-pre1/kernel/timer.c Wed Feb 27 08:26:59 2002
+++ linux-wait-256p1/kernel/timer.c Fri Mar 15 19:06:19 2002
@@ -742,14 +742,14 @@
         struct task_struct * me = current;
         struct task_struct * parent;
 
- parent = me->p_opptr;
+ parent = me->real_parent;
         for (;;) {
                 pid = parent->pid;
 #if CONFIG_SMP
 {
                 struct task_struct *old = parent;
                 mb();
- parent = me->p_opptr;
+ parent = me->real_parent;
                 if (old != parent)
                         continue;
 }
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Fri Mar 15 2002 - 22:00:21 EST