[PATCH v7] do_wait: make PIDTYPE_PID case O(1) instead of O(n)

From: Jim Newsome
Date: Sun Mar 14 2021 - 19:23:24 EST


This patch adds a special-case when waiting on a pid (via waitpid,
waitid, wait4, etc) to avoid doing an O(n) scan of children and tracees,
and instead do an O(1) lookup. This improves performance when waiting on
a pid from a thread group with many children and/or tracees.

Time to fork and then call waitpid on the child, from a task that
already has N children [1]:

N | Before | After
-----|---------|------
1 | 74 us | 74 us
20 | 72 us | 75 us
100 | 83 us | 77 us
500 | 99 us | 74 us
1000 | 179 us | 75 us
5000 | 804 us | 79 us
8000 | 1268 us | 78 us

[1]: https://lkml.org/lkml/2021/3/12/1567

This can make a substantial performance improvement for applications
with a thread that has many children or tracees and frequently needs to
wait on them. Tools that use ptrace to intercept syscalls for a large
number of processes are likely to fall into this category. In particular
this patch was developed while building a ptrace-based second generation
of the Shadow emulator [2], for which it allows us to avoid quadratic
scaling (without having to use a workaround that introduces a ~40%
performance penalty) [3]. Other examples of tools that fall into this
category which this patch may help include User Mode Linux [4] and
DetTrace [5].

[2]: https://shadow.github.io/
[3]: https://github.com/shadow/shadow/issues/1134#issuecomment-798992292
[4]: https://en.wikipedia.org/wiki/User-mode_Linux
[5]: https://github.com/dettrace/dettrace

Signed-off-by: James Newsome <jnewsome@xxxxxxxxxxxxxx>
---

v5: https://lkml.org/lkml/2021/3/12/1134
v6 accidentally dropped a commit in the rebase + squash.

Since v5:
* Switched back to explicitly looking up by tgid and then pid.
* Added further motivation and context in the patch description.

kernel/exit.c | 67 +++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 57 insertions(+), 10 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69a..65809fac3038 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1439,9 +1439,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
TASK_INTERRUPTIBLE, p);
}

+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+ struct task_struct *target)
+{
+ struct task_struct *parent =
+ !ptrace ? target->real_parent : target->parent;
+
+ return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+ same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+ bool ptrace;
+ struct task_struct *target;
+ int retval;
+
+ ptrace = false;
+ target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+ if (target && is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ ptrace = true;
+ target = pid_task(wo->wo_pid, PIDTYPE_PID);
+ if (target && target->ptrace &&
+ is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ return 0;
+}
+
static long do_wait(struct wait_opts *wo)
{
- struct task_struct *tsk;
int retval;

trace_sched_process_wait(wo->wo_pid);
@@ -1463,19 +1502,27 @@ static long do_wait(struct wait_opts *wo)

set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
- tsk = current;
- do {
- retval = do_wait_thread(wo, tsk);
- if (retval)
- goto end;

- retval = ptrace_do_wait(wo, tsk);
+ if (wo->wo_type == PIDTYPE_PID) {
+ retval = do_wait_pid(wo);
if (retval)
goto end;
+ } else {
+ struct task_struct *tsk = current;
+
+ do {
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
+ goto end;

- if (wo->wo_flags & __WNOTHREAD)
- break;
- } while_each_thread(current, tsk);
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ goto end;
+
+ if (wo->wo_flags & __WNOTHREAD)
+ break;
+ } while_each_thread(current, tsk);
+ }
read_unlock(&tasklist_lock);

notask:
--
2.30.1