Re: core dumps from multi-threaded (kernel cloned) processes

Philip Gladstone (philip@raptor.com)
Thu, 20 Nov 1997 09:42:17 -0500


This is a multi-part message in MIME format.
--------------1EA90F5394623F5DFB7FBA27
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Attached are two patches. The first is a kernel patch for 2.0.32
that creates multithreaded core dumps. The second is a gdb-4.16 patch
that allows gdb to inspect these dumps.

There are the following problems with the linux patch:

1) It only works for i386, and will fail to compile on other
architectures. This is a feature -- actually I'm hoping that someone
writes the helper function for the other architectures.

2) It probably doesn't work well on SMP systems (I don't have any to
test it on, and I'm a little unclear on the model).

3) The way that it works is truly horrible. It *may* deadlock the set of
crashing processes owing to the way that it uses the mmap_sem.

4) It doesn't dump the fpregs of any process other than the crashing
one. This is also a feature, as gdb won't look at them either!

The upside is that it works, and if you are doing anything with
multithreaded processes, it is a big win. Note that you probably want
to start out with a patched gdb-4.16 (from a suitable source RPM);
otherwise you will suffer from the find_solib problem.

However, there is a little patch at the end of linux-mt.pf (to exit.c)
which allows gdb to attach to multithreaded processes -- well, to a
single thread/process. Doing anything better turns out to be rather
complex -- especially as I don't understand the ptrace semantics for
multithreaded environments.

Philip

Jim Nance wrote:
>
> On Wed, Nov 19, 1997 at 09:46:31AM -0500, Philip Gladstone wrote:
> > Well, GDB can now attach to a thread (aka process), but only one
> > at once. To be honest, that was all that I needed. I'm going to take
> > a look at seeing if I can see how to make it support multiple
> > threads.
>
> I have read that there are GDB patches for SGI to make it work with
> threaded processes. Since SGI's threading model is close to Linux's,
> it might be possible to port those. Actually they may be part of
> standard gdb for the SGI.
>
> > I'll send you the consolidated patches today.
>
> Thanks!
>
> Jim

-- 
Philip Gladstone                           +1 617 487 7700
Raptor Systems, Waltham, MA         http://www.raptor.com/
Our new daughter: http://www.mwmc.com/Extweb/Cybernursery/17423662.htm
--------------1EA90F5394623F5DFB7FBA27
Content-Type: text/plain; charset=us-ascii; name="gdb416.pf"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="gdb416.pf"

--- bfd/elfcore.h.old Sun Nov 16 18:56:41 1997 +++ gdb-4.16/bfd/elfcore.h Sun Nov 16 20:09:18 1997 @@ -22,7 +22,7 @@ #include <signal.h> #include <sys/procfs.h> #else -#define bfd_prstatus(abfd, descdata, descsz, filepos) true +#define bfd_prstatus(abfd, descdata, descsz, filepos, thrd) true #define bfd_fpregset(abfd, descdata, descsz, filepos) true #define bfd_prpsinfo(abfd, descdata, descsz, filepos) true #endif @@ -30,28 +30,35 @@ #ifdef HAVE_SYS_PROCFS_H static boolean -bfd_prstatus (abfd, descdata, descsz, filepos) +bfd_prstatus (abfd, descdata, descsz, filepos, thrd) bfd *abfd; char *descdata; int descsz; long filepos; + int thrd; { asection *newsect; prstatus_t *status = (prstatus_t *) 0; if (descsz == sizeof (prstatus_t)) { - newsect = bfd_make_section (abfd, ".reg"); + char sname[32]; + sprintf(sname, ".reg/%d", ((prstatus_t *) descdata)->pr_pid); + newsect = bfd_make_section (abfd, thrd > 0 ? strdup(sname) : ".reg"); if (newsect == NULL) return false; newsect->_raw_size = sizeof (status->pr_reg); newsect->filepos = filepos + (long) &status->pr_reg; newsect->flags = SEC_HAS_CONTENTS; newsect->alignment_power = 2; - if ((core_prstatus (abfd) = bfd_alloc (abfd, descsz)) != NULL) - { - memcpy (core_prstatus (abfd), descdata, descsz); - } + + if (thrd == 1) { + if ((core_prstatus (abfd) = bfd_alloc (abfd, descsz)) != NULL) + { + memcpy (core_prstatus (abfd), descdata, descsz); + } + return bfd_prstatus(abfd, descdata, descsz, filepos, 0); + } } return true; } @@ -237,6 +244,7 @@ char *sectname; /* Name to use for new section */ long filepos; /* File offset to descriptor data */ asection *newsect; + int thrd = 0; if (hdr->p_filesz > 0 && (buf = (char *) bfd_malloc ((size_t) hdr->p_filesz)) != NULL @@ -256,9 +264,9 @@ { case NT_PRSTATUS: /* process descdata as prstatus info */ - if (! bfd_prstatus (abfd, descdata, i_note.descsz, filepos)) + if (! 
bfd_prstatus (abfd, descdata, i_note.descsz, filepos, ++thrd)) return false; - sectname = ".prstatus"; + sectname = (thrd == 1) ? ".prstatus" : NULL; break; case NT_FPREGSET: /* process descdata as fpregset info */ --- gdb-4.16/gdb/main.c.orig Sat Apr 13 00:51:36 1996 +++ gdb-4.16/gdb/main.c Tue Nov 18 15:27:42 1997 @@ -112,6 +112,7 @@ /* This needs to happen before the first use of malloc. */ init_malloc ((PTR) NULL); + (void) calloc(1,1); #if defined (ALIGN_STACK_ON_STARTUP) i = (int) &count & 0x3;

--------------1EA90F5394623F5DFB7FBA27
Content-Type: text/plain; charset=us-ascii; name="linux-mt.pf"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="linux-mt.pf"

--- linux/arch/i386/kernel/ptrace.c.orig Tue Nov 18 10:39:22 1997 +++ linux/arch/i386/kernel/ptrace.c Tue Nov 18 11:06:57 1997 @@ -688,3 +688,9 @@ current->signal |= (1 << (current->exit_code - 1)); current->exit_code = 0; } + +void get_pt_regs_for_task(struct pt_regs *regs, struct task_struct *task) +{ + *regs = *(struct pt_regs *) (((unsigned char *) task->tss.esp0) - MAGICNUMBER); +} + --- linux/fs/binfmt_elf.c.orig Wed Oct 15 17:56:43 1997 +++ linux/fs/binfmt_elf.c Tue Nov 18 13:11:35 1997 @@ -987,20 +987,55 @@ struct elfhdr elf; off_t offset = 0, dataoff; int limit = current->rlim[RLIMIT_CORE].rlim_cur; - int numnote = 4; - struct memelfnote notes[4]; - struct elf_prstatus prstatus; /* NT_PRSTATUS */ + int numnote; + struct memelfnote *notes; + struct elf_prstatus *prstatus; /* NT_PRSTATUS */ elf_fpregset_t fpu; /* NT_PRFPREG */ struct elf_prpsinfo psinfo; /* NT_PRPSINFO */ + int n_pids, this_pid; + struct task_struct *p; + + /* The very first thing is to grab the semaphore on the mmap */ + down(&current->mm->mmap_sem); - if (!current->dumpable || limit < PAGE_SIZE || current->mm->count != 1) + if (!current->dumpable || limit < PAGE_SIZE) { + up(&current->mm->mmap_sem); return 0; + } current->dumpable = 0; #ifndef CONFIG_BINFMT_ELF MOD_INC_USE_COUNT; #endif + /* See if we are alone in this process, or if other people + * share our mm + */ + n_pids = 0; + for_each_task(p) { + if (current->mm == p->mm) { + p->dumpable = 0; + n_pids++; + if (current != p) { + force_sig(signr, p); + } + } + } + + notes = (struct memelfnote *) kmalloc(sizeof(*notes) * (n_pids + 3), GFP_KERNEL); + if (!notes) { + up(&current->mm->mmap_sem); + goto exit_dump; + } + memset((char *) notes, 0, sizeof(*notes) * (n_pids + 3)); + prstatus = (struct elf_prstatus *) kmalloc(sizeof(*prstatus) * n_pids, GFP_KERNEL); + if (!prstatus) { + up(&current->mm->mmap_sem); + kfree(notes); + goto exit_dump; + } + memset((char *) prstatus, 0, sizeof(*prstatus) * n_pids); + /* Count what's needed to dump, 
up to the limit of coredump size */ segs = 0; size = 0; @@ -1044,72 +1079,41 @@ fs = get_fs(); set_fs(KERNEL_DS); - memcpy(corefile,"core.",5); -#if 0 - memcpy(corefile+5,current->comm,sizeof(current->comm)); -#else - corefile[4] = '\0'; -#endif - if (open_namei(corefile,O_CREAT | 2 | O_TRUNC,0600,&inode,NULL)) { - inode = NULL; - goto end_coredump; - } - if (!S_ISREG(inode->i_mode)) - goto end_coredump; - if (!inode->i_op || !inode->i_op->default_file_ops) - goto end_coredump; - file.f_mode = 3; - file.f_flags = 0; - file.f_count = 1; - file.f_inode = inode; - file.f_pos = 0; - file.f_reada = 0; - file.f_op = inode->i_op->default_file_ops; - if (file.f_op->open) - if (file.f_op->open(inode,&file)) - goto end_coredump; - if (!file.f_op->write) - goto close_coredump; - has_dumped = 1; - current->flags |= PF_DUMPCORE; - - DUMP_WRITE(&elf, sizeof(elf)); - offset += sizeof(elf); /* Elf header */ - offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ /* * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. 
*/ memset(&psinfo, 0, sizeof(psinfo)); - memset(&prstatus, 0, sizeof(prstatus)); + + numnote = 4; notes[0].name = "CORE"; notes[0].type = NT_PRSTATUS; - notes[0].datasz = sizeof(prstatus); - notes[0].data = &prstatus; - prstatus.pr_info.si_signo = prstatus.pr_cursig = signr; - prstatus.pr_sigpend = current->signal; - prstatus.pr_sighold = current->blocked; - psinfo.pr_pid = prstatus.pr_pid = current->pid; - psinfo.pr_ppid = prstatus.pr_ppid = current->p_pptr->pid; - psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp; - psinfo.pr_sid = prstatus.pr_sid = current->session; - prstatus.pr_utime.tv_sec = CT_TO_SECS(current->utime); - prstatus.pr_utime.tv_usec = CT_TO_USECS(current->utime); - prstatus.pr_stime.tv_sec = CT_TO_SECS(current->stime); - prstatus.pr_stime.tv_usec = CT_TO_USECS(current->stime); - prstatus.pr_cutime.tv_sec = CT_TO_SECS(current->cutime); - prstatus.pr_cutime.tv_usec = CT_TO_USECS(current->cutime); - prstatus.pr_cstime.tv_sec = CT_TO_SECS(current->cstime); - prstatus.pr_cstime.tv_usec = CT_TO_USECS(current->cstime); + notes[0].datasz = sizeof(prstatus[0]); + notes[0].data = &prstatus[0]; + prstatus[0].pr_info.si_signo = prstatus[0].pr_cursig = signr; + prstatus[0].pr_sigpend = current->signal; + prstatus[0].pr_sighold = current->blocked; + psinfo.pr_pid = prstatus[0].pr_pid = current->pid; + psinfo.pr_ppid = prstatus[0].pr_ppid = current->p_pptr->pid; + psinfo.pr_pgrp = prstatus[0].pr_pgrp = current->pgrp; + psinfo.pr_sid = prstatus[0].pr_sid = current->session; + prstatus[0].pr_utime.tv_sec = CT_TO_SECS(current->utime); + prstatus[0].pr_utime.tv_usec = CT_TO_USECS(current->utime); + prstatus[0].pr_stime.tv_sec = CT_TO_SECS(current->stime); + prstatus[0].pr_stime.tv_usec = CT_TO_USECS(current->stime); + prstatus[0].pr_cutime.tv_sec = CT_TO_SECS(current->cutime); + prstatus[0].pr_cutime.tv_usec = CT_TO_USECS(current->cutime); + prstatus[0].pr_cstime.tv_sec = CT_TO_SECS(current->cstime); + prstatus[0].pr_cstime.tv_usec = CT_TO_USECS(current->cstime); 
/* * This transfers the registers from regs into the standard * coredump arrangement, whatever that is. */ #ifdef ELF_CORE_COPY_REGS - ELF_CORE_COPY_REGS(prstatus.pr_reg, regs) + ELF_CORE_COPY_REGS(prstatus[0].pr_reg, regs) #else if (sizeof(elf_gregset_t) != sizeof(struct pt_regs)) { @@ -1117,12 +1121,12 @@ sizeof(elf_gregset_t), sizeof(struct pt_regs)); } else - *(struct pt_regs *)&prstatus.pr_reg = *regs; + *(struct pt_regs *)&prstatus[0].pr_reg = *regs; #endif #ifdef DEBUG dump_regs("Passed in regs", (elf_greg_t *)regs); - dump_regs("prstatus regs", (elf_greg_t *)&prstatus.pr_reg); + dump_regs("prstatus regs", (elf_greg_t *)&prstatus[0].pr_reg); #endif notes[1].name = "CORE"; @@ -1160,8 +1164,8 @@ notes[2].data = current; /* Try to dump the fpu. */ - prstatus.pr_fpvalid = dump_fpu (regs, &fpu); - if (!prstatus.pr_fpvalid) + prstatus[0].pr_fpvalid = dump_fpu (regs, &fpu); + if (!prstatus[0].pr_fpvalid) { numnote--; } @@ -1172,7 +1176,93 @@ notes[3].datasz = sizeof(fpu); notes[3].data = &fpu; } + + this_pid = 1; + for_each_task(p) { + if (this_pid >= n_pids) + break; + + if (current->mm == p->mm && p != current) { + extern void get_pt_regs_for_task(struct pt_regs *, struct task_struct *p); + + notes[numnote].name = "CORE"; + notes[numnote].type = NT_PRSTATUS; + notes[numnote].datasz = sizeof(*prstatus); + notes[numnote].data = &prstatus[this_pid]; + prstatus[this_pid].pr_sigpend = p->signal; + prstatus[this_pid].pr_sighold = p->blocked; + prstatus[this_pid].pr_pid = p->pid; + prstatus[this_pid].pr_ppid = p->p_pptr->pid; + prstatus[this_pid].pr_pgrp = p->pgrp; + prstatus[this_pid].pr_sid = p->session; + prstatus[this_pid].pr_utime.tv_sec = CT_TO_SECS(p->utime); + prstatus[this_pid].pr_utime.tv_usec = CT_TO_USECS(p->utime); + prstatus[this_pid].pr_stime.tv_sec = CT_TO_SECS(p->stime); + prstatus[this_pid].pr_stime.tv_usec = CT_TO_USECS(p->stime); + prstatus[this_pid].pr_cutime.tv_sec = CT_TO_SECS(p->cutime); + prstatus[this_pid].pr_cutime.tv_usec = 
CT_TO_USECS(p->cutime); + prstatus[this_pid].pr_cstime.tv_sec = CT_TO_SECS(p->cstime); + prstatus[this_pid].pr_cstime.tv_usec = CT_TO_USECS(p->cstime); + + /* + * This transfers the registers from regs into the standard + * coredump arrangement, whatever that is. + */ +#ifdef ELF_CORE_COPY_REGS + { + struct pt_regs pregs; + get_pt_regs_for_task(&pregs, p); + ELF_CORE_COPY_REGS(prstatus[this_pid].pr_reg, &pregs) + } +#else + if (sizeof(elf_gregset_t) != sizeof(struct pt_regs)) + { + printk("sizeof(elf_gregset_t) (%d) != sizeof(struct pt_regs) (%d)\n", + sizeof(elf_gregset_t), sizeof(struct pt_regs)); + } + else + get_pt_regs_for_task((struct pt_regs *)&prstatus[this_pid].pr_reg, p); +#endif + numnote++; + this_pid++; + } + } + + up(&current->mm->mmap_sem); + memcpy(corefile,"core.",5); +#if 0 + memcpy(corefile+5,current->comm,sizeof(current->comm)); +#else + corefile[4] = '\0'; +#endif + if (open_namei(corefile,O_CREAT | 2 | O_TRUNC,0600,&inode,NULL)) { + inode = NULL; + goto end_coredump; + } + if (!S_ISREG(inode->i_mode)) + goto end_coredump; + if (!inode->i_op || !inode->i_op->default_file_ops) + goto end_coredump; + file.f_mode = 3; + file.f_flags = 0; + file.f_count = 1; + file.f_inode = inode; + file.f_pos = 0; + file.f_reada = 0; + file.f_op = inode->i_op->default_file_ops; + if (file.f_op->open) + if (file.f_op->open(inode,&file)) + goto end_coredump; + if (!file.f_op->write) + goto close_coredump; + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + DUMP_WRITE(&elf, sizeof(elf)); + offset += sizeof(elf); /* Elf header */ + offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + /* Write notes phdr entry */ { struct elf_phdr phdr; @@ -1258,6 +1348,11 @@ end_coredump: set_fs(fs); iput(inode); + + kfree(prstatus); + kfree(notes); + + exit_dump: #ifndef CONFIG_BINFMT_ELF MOD_DEC_USE_COUNT; #endif --- linux/kernel/exit.c.orig Wed Nov 19 16:33:22 1997 +++ linux/kernel/exit.c Wed Nov 19 16:36:38 1997 @@ -639,9 +639,14 @@ if (p->pgrp != -pid) 
continue; } - /* wait for cloned processes iff the __WCLONE flag is set */ - if ((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) - continue; + /* If you are tracing a process, then you don't need to get the + * WCLONE bit right -- useful for strace and gdb + */ + if (!(p->flags & (PF_PTRACED|PF_TRACESYS))) { + /* wait for cloned processes iff the __WCLONE flag is set */ + if ((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) + continue; + } flag = 1; switch (p->state) { case TASK_STOPPED:

--------------1EA90F5394623F5DFB7FBA27--