thread leader death under strace (was Re: [PATCH 03/10] ptrace:implement PTRACE_SEIZE)

From: Denys Vlasenko
Date: Thu Jun 02 2011 - 11:34:02 EST


On Thu, Jun 2, 2011 at 7:01 AM, Tejun Heo <tj@xxxxxxxxxx> wrote:
> Maybe I misunderstood the problem but wasn't the problem about not
> being able to wait for the exit of a leader thread and detach it?  We
> have reliable (sans exec but that's a different story) exit
> notification with EVENT_EXIT which even reports the exit_code, so I
> don't see what the problem is.  What am I missing?

The problem is that right now it seems that if tracer doesn't catch
EVENT_EXIT and detach tracee when it sees it, really weird things
happen.

Case 1: tracer traces thread leader. An untraced thread execs.
tracer sees EVENT_EXIT, PTRACE_CONTs tracee, and...
never sees WIFEXITED, waitpid just blocks forever!

Case 2: discovered when I started experiments with current kernel
behavior. No execve involved. I just run two tracees, make
leader exit, then make the other tracee signal itself with fatal signal
(SIGUSR1).
Tracer sees leader's exit, but never sees other tracee's signal!

Please see attached program.
The output on my F15 machine:

4816: thread leader
4816: status:0003057f WIFSTOPPED sig:5 (TRAP) event:CLONE eventdata:0x12d1
4817: status:0000137f WIFSTOPPED sig:19 (STOP) event:none eventdata:0x0
EXITING <=== leader will exit now. tracer sees it:
4816: status:0006057f WIFSTOPPED sig:5 (TRAP) event:EXIT eventdata:0x7700
DYING <=== other thread SIGUSR1's itself. tracer *doesn't see anything*
alarm timed out <=== tracer dies on alarm(1).

Now, if I send a non-fatal signal first, by uncommenting these lines:
// usleep(100*1000);
// VERBOSE("WINCH\n");
// raise(SIGWINCH);

Output:

4834: thread leader
4834: status:0003057f WIFSTOPPED sig:5 (TRAP) event:CLONE eventdata:0x12e3
4835: status:0000137f WIFSTOPPED sig:19 (STOP) event:none eventdata:0x0
EXITING <=== leader will exit now. tracer sees it:
4834: status:0006057f WIFSTOPPED sig:5 (TRAP) event:EXIT eventdata:0x7700
WINCH <=== other thread WINCH's itself. tracer *doesn't see anything*
DYING <=== other thread SIGUSR1's itself. tracer sees it:
4835: status:00000a7f WIFSTOPPED sig:10 (USR1) event:none eventdata:0x0
tracer sees other thread about to die from signal 10:
4835: status:0006057f WIFSTOPPED sig:5 (TRAP) event:EXIT eventdata:0xa
tracer sees other thread die from signal 10:
4835: status:0000000a WIFSIGNALED sig:10 (USR1)
tracer sees leader die from signal 10:
4834: status:0000000a WIFSIGNALED sig:10 (USR1)
tracer is informed "no more tracees":
waitpid returned -1

This is just plain broken, right?
(1) It should have worked in the first case too.
(2) Where is SIGWINCH notification? Why ptrace didn't report it?

Re (2): if I disable leader exit - these lines:
VERBOSE("EXITING\n");
syscall(__NR_exit, 0x77);

then I do see SIGWINCH notification:

4891: thread leader
4891: status:0003057f WIFSTOPPED sig:5 (TRAP) event:CLONE eventdata:0x131c
4892: status:0000137f WIFSTOPPED sig:19 (STOP) event:none eventdata:0x0
WINCH
4891: status:00001c7f WIFSTOPPED sig:28 ((null)) event:none eventdata:0x131c
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
DYING
4891: status:00000a7f WIFSTOPPED sig:10 (USR1) event:none eventdata:0x131c
4892: status:0006057f WIFSTOPPED sig:5 (TRAP) event:EXIT eventdata:0xa
4891: status:0006057f WIFSTOPPED sig:5 (TRAP) event:EXIT eventdata:0xa
4892: status:0000000a WIFSIGNALED sig:10 (USR1)
4891: status:0000000a WIFSIGNALED sig:10 (USR1)
waitpid returned -1

Looks like leader's exit throws a wrench into ptrace machinery.

Now I understand why strace DETACHs on exit! :)

--
vda
/* ...DESCRITION...

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely. */

#define _GNU_SOURCE 1
#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
/* #include <pthread.h> */
/* Dance around ptrace.h + user.h incompatibility */
#ifdef __ia64__
# define ia64_fpreg ia64_fpreg_DISABLE
# define pt_all_user_regs pt_all_user_regs_DISABLE
#endif
#include <sys/ptrace.h>
#include <linux/ptrace.h>
#ifdef __ia64__
# undef ia64_fpreg
# undef pt_all_user_regs
#endif
#include <sys/user.h>
#if defined __i386__ || defined __x86_64__
# include <sys/debugreg.h>
#endif
/* Define clone2 for all arches */
#ifdef __ia64__
extern int __clone2(int (*fn) (void *), void *child_stack_base,
size_t stack_size, int flags, void *arg, ...);
#define clone2 __clone2
#else
#define clone2(func, stack_base, size, flags, arg...) \
clone(func, (stack_base) + (size), flags, arg)
#endif


static int verbose;

#define VERBOSE(...) do { \
if (verbose) { \
printf(__VA_ARGS__); fflush(stdout); \
} \
} while (0)

static pid_t child;
/*static pid_t grandchild;*/

static void
sigkill(pid_t *pp)
{
pid_t pid = *pp;
*pp = 0;
if (pid > 0)
kill(pid, SIGKILL);
}

static void
cleanup(void)
{
/*sigkill(&grandchild);*/
sigkill(&child);
while (waitpid(-1, NULL, __WALL) > 0)
continue;
}

static void
handler_fail(int signo)
{
VERBOSE("alarm timed out\n");
sigset_t set;
signal(SIGABRT, SIG_DFL);
signal(SIGALRM, SIG_DFL);
/* SIGALRM may be blocked in sighandler, need to unblock */
sigfillset(&set);
sigprocmask(SIG_UNBLOCK, &set, NULL);
/* Due to kernel bugs, waitpid may block. Need to have a timeout */
alarm(1);
cleanup();
exit(0); //assert(0);
}

static const char* sig_name(unsigned sig)
{
static const char *const sigs[] = {
[SIGSTOP] = "STOP", [SIGTRAP] = "TRAP", [SIGKILL] = "KILL",
[SIGTERM] = "TERM", [SIGINT ] = "INT" , [SIGALRM] = "ALRM",
[SIGABRT] = "ABRT", [SIGUSR1] = "USR1", [SIGUSR2] = "USR2",
[0 ] = "0",
[SIGTRAP|0x80] = "TRAP|80",
};
static const unsigned num_sigs = sizeof(sigs) / sizeof(sigs[0]);
if (sig < num_sigs)
return sigs[sig];
return NULL;
}

static const char* event_name(int status)
{
static const char *const events[] = {
[0] = "none",
[PTRACE_EVENT_FORK ] = "FORK",
[PTRACE_EVENT_VFORK ] = "VFORK",
[PTRACE_EVENT_CLONE ] = "CLONE",
[PTRACE_EVENT_EXEC ] = "EXEC",
[PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
[PTRACE_EVENT_EXIT ] = "EXIT",
};
static const unsigned num_events = sizeof(events) / sizeof(events[0]);
status = (unsigned)status >> 16;
if (status < num_events)
return events[status];
return NULL;
}

/****************** Standard scaffolding ends here ****************/

/*
* Extended commentary of the entire test.
*
* What kernels / patches exhibit it? When it was fixed?
* Is it CPU vendor/model dependent? SMP dependent?
* Is it deterministic?
* How easy/hard is to reproduce it
* (always? a dozen loops? a second? minute? etc)
*/

/* If the test is not deterministic:
* Amount of seconds needed to almost 100% catch it */
//#define DEFAULT_TESTTIME 5
/* or (if reproducible in a few loops only) */
//#define DEFAULT_LOOPS 100

static int
thread1(void *unused)
{
// usleep(100*1000);
// VERBOSE("WINCH\n");
// raise(SIGWINCH);

usleep(100*1000);
VERBOSE("DYING\n");
raise(SIGUSR1);

for(;;) pause();
return 0;
}

static int
thread_leader(void *unused)
{
/* malloc gives sufficiently aligned buffer.
* long buf[] does not! (on ia64).
*/
clone2(thread1, malloc(16 * 1024), 16 * 1024, 0
| CLONE_VM
| CLONE_FS
| CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM
// | CLONE_PTRACE
| 0 /* no signal to send on death */
, NULL);
VERBOSE("EXITING\n");
syscall(__NR_exit, 0x77);
for(;;) pause();
return 0;
}

/* If nothing strange happens, just returns.
* Notable events (which are not bugs) print some sort of marker
* is verbose is on, but still continue and return normally.
* Known bugs also print a message if verbose, but they exit(1).
* New bugs are likely to trip asserts or cause hang/kernel crash :)
*/
static void
reproduce(void)
{
int status;
pid_t pid;

alarm(1);

/* Typical scenario starts like this. */
child = fork();
assert(child != -1);
if (child == 0) {
/* child */
errno = 0;
ptrace(PTRACE_TRACEME, 0, (void *) 0, (void *) 0);
assert_perror(errno);
raise(SIGSTOP);
assert_perror(errno);

printf("%d: thread leader\n", getpid());
thread_leader(NULL);
}

/* We are parent tracer */
assert(child > 0);
errno = 0;

/* Child has stopped itself, checking */
pid = waitpid(child, &status, 0);
assert(pid == child);
assert(WIFSTOPPED(status));
assert(WSTOPSIG(status) == SIGSTOP);

ptrace(PTRACE_SETOPTIONS, child, NULL, 0
| PTRACE_O_TRACESYSGOOD
| PTRACE_O_TRACECLONE
| PTRACE_O_TRACEEXIT
| PTRACE_O_TRACEEXEC
);
assert_perror(errno);

ptrace(PTRACE_CONT, child, NULL, (void *) 0);
assert_perror(errno);

/* Let's just look on the resulting sequence of events */
for (;;) {
pid_t pid = waitpid(-1, &status, __WALL);
if (pid <= 0) {
printf("waitpid returned %d\n", pid);
return;
}
if (WIFSTOPPED(status)) {
int sig = WSTOPSIG(status);

char event_str[sizeof(long)*3] = "<err>";
long event_data = 0x11111111;
int have_event = !ptrace(PTRACE_GETEVENTMSG, pid, 0L, &event_data);
if (have_event)
sprintf(event_str, "0x%lx", event_data);

printf("%d: status:%08x WIFSTOPPED sig:%d (%s) event:%s eventdata:%s\n",
pid, status,
sig, sig_name(sig),
event_name(status),
event_str
);
if (sig == SIGSTOP) sig = 0;
if (sig == SIGTRAP) sig = 0;
if (sig == (SIGTRAP|0x80)) sig = 0;
ptrace(PTRACE_CONT, pid, NULL, sig);
assert_perror(errno);
}
else if (WIFEXITED(status))
printf("%d: status:%08x WIFEXITED exitcode:%d\n",
pid, status, WEXITSTATUS(status));
else if (WIFSIGNALED(status))
printf("%d: status:%08x WIFSIGNALED sig:%d (%s)\n",
pid, status, WTERMSIG(status), sig_name(WTERMSIG(status)));
else
printf("%d: status:%08x - ???\n",
pid, status);
}

cleanup();
}

int
main(int argc, char **argv)
{
setbuf(stdout, NULL);

#if defined DEFAULT_TESTTIME || defined DEFAULT_LOOPS
int i;
char *env_testtime = getenv("TESTTIME"); /* misnomer */
int testtime = (env_testtime ? atoi(env_testtime) : 1);
#endif

atexit(cleanup);
signal(SIGINT, handler_fail);
signal(SIGABRT, handler_fail);
signal(SIGALRM, handler_fail);
verbose = 1 ;//(argc - 1);

#if defined DEFAULT_TESTTIME
testtime *= DEFAULT_TESTTIME;
for(i = 0; i < testtime; i++) {
time_t t = time(NULL);
while (t == time(NULL)) {
VERBOSE(".");
reproduce();
}
}
VERBOSE("\n");
#elif defined DEFAULT_LOOPS
testtime *= DEFAULT_LOOPS;
for(i = 0; i < testtime; i++) {
VERBOSE(".");
reproduce();
}
VERBOSE("\n");
#else
reproduce();
#endif

return 0;
}