[PATCH] Fix race condition when exec'ing setuid files

From: Jorge Merlino
Date: Sat Sep 10 2022 - 17:14:05 EST


This patch fixes a race condition in check_unsafe_exec when a heavily
threaded program tries to exec a setuid file. check_unsafe_exec counts the
number of threads sharing the same fs_struct and compares it to the total
number of users of the fs_struct by looking at its users counter. If there
are more users than process threads using it the setuid exec fails with
LSM_UNSAFE_SHARE. The problem is that, during the kernel_clone code
execution, the fs_struct users counter is incremented before the new thread
is added to the thread_group list. So there is a race when the counter has
been incremented but the thread is not visible to the while_each_thread loop
in check_unsafe_exec.

This patch sort of fixes this by setting a process flag to the parent
process during the time this race is possible. Thus, if a process is
forking, it counts an extra user of the fs_struct as the counter might be
incremented before the thread is visible. But this is not great as this
could generate the opposite problem as there may be an external process
sharing the fs_struct that is masked by some thread that is being counted
twice. I submit this patch just as an idea but mainly I want to introduce
this issue and see if someone comes up with a better solution.

Here is a simple program to reproduce this issue:

$ cat Makefile
# Build the reproducer (a) and the setuid-root helper (b).
ALL=a b
.PHONY: all test clean
all: $(ALL)

# a is built by make's implicit rule; it needs pthreads.
a: LDFLAGS=-pthread

# b must be owned by root and carry the setuid bit.
b: b.c
	$(CC) b.c -o b
	sudo chown root:root b
	sudo chmod u+s b

# Run the reproducer 1000 times, printing the iteration number first.
# The shell variable is written $$I: a bare $I would be expanded by make
# itself (to an empty string) before the shell ever sees it.
test:
	for I in $$(seq 1000); do echo $$I; ./a ; done

clean:
	rm -vf $(ALL)

$ cat a.c
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/*
 * Thread start routine that does no work: it ignores its argument and
 * finishes immediately with a NULL result.
 */
void *nothing(void *p)
{
	void *result = NULL;

	(void)p;	/* argument is intentionally unused */
	return result;
}

/*
 * Thread start routine that loops forever creating a thread running
 * nothing() and joining it, so that thread creation is constantly in
 * progress in this process. The argument is ignored.
 */
void *target(void *p)
{
	(void)p;

	while (1) {
		pthread_t child;

		/* If creation failed there is nothing to join; just try again. */
		if (pthread_create(&child, NULL, nothing, NULL) != 0)
			continue;
		pthread_join(child, NULL);
	}

	return NULL;	/* not reached */
}

/*
 * Reproducer entry point: start ten threads running target(), sleep for
 * 100 microseconds, then exec ./b while those threads are still creating
 * and joining threads of their own.
 *
 * On success the exec replaces this program and main() never returns.
 * Returns 1 if the exec fails.
 */
int main(void)
{
	struct timespec tv;
	int i;

	for (i = 0; i < 10; i++) {
		pthread_t t;
		int err = pthread_create(&t, NULL, target, NULL);

		/* pthread_create() reports failure through its return value. */
		if (err != 0)
			fprintf(stderr, "pthread_create: error %d\n", err);
	}

	tv.tv_sec = 0;
	tv.tv_nsec = 100000;	/* 100 us */
	nanosleep(&tv, NULL);

	/*
	 * The variadic argument list must be terminated by a null pointer of
	 * type char *; a bare NULL is not guaranteed to have pointer type.
	 */
	execl("./b", "./b", (char *)NULL);

	/* execl() only returns when it has failed. */
	perror("execl");
	return 1;
}

$ cat b.c
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Exec target for the reproducer: checks the effective user ID it was
 * started with.
 *
 * Returns 0 when the effective UID is 0; otherwise prints the UID it got
 * and returns 1.
 */
int main(void)
{
	const uid_t euid = geteuid();

	if (euid != 0) {
		/*
		 * uid_t has no printf conversion of its own, so cast it to a type
		 * with a known specifier rather than passing it to %d.
		 */
		printf("Failed, got euid %lu (expecting 0)\n", (unsigned long)euid);
		return 1;
	}
	return 0;
}

$ make
make
cc -pthread a.c -o a
cc b.c -o b
sudo chown root:root b
sudo chmod u+s b
$ make test

Without this fix, one will see 'Failed, got euid 1000 (expecting 0)' messages
---
fs/exec.c | 2 ++
include/linux/sched.h | 1 +
kernel/fork.c | 3 +++
3 files changed, 6 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index 9a5ca7b82bfc..a6f949a899d5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1581,6 +1581,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
while_each_thread(p, t) {
if (t->fs == p->fs)
n_fs++;
+ if (t->flags & PF_IN_FORK)
+ n_fs++;
}
rcu_read_unlock();

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711..f307165a434a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1722,6 +1722,7 @@ extern struct pid *cad_pid;
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
+#define PF_IN_FORK 0x02000000 /* Process is forking, prevents race condition on fs_struct users value */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a9e92068b15..54e1e1fbe0bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2245,6 +2245,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_files(clone_flags, p);
if (retval)
goto bad_fork_cleanup_semundo;
+ current->flags |= PF_IN_FORK;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
@@ -2474,6 +2475,7 @@ static __latent_entropy struct task_struct *copy_process(
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
+ current->flags &= ~PF_IN_FORK;
total_forks++;
hlist_del_init(&delayed.node);
spin_unlock(&current->sighand->siglock);
@@ -2556,6 +2558,7 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock_irq(&current->sighand->siglock);
hlist_del_init(&delayed.node);
spin_unlock_irq(&current->sighand->siglock);
+ current->flags &= ~PF_IN_FORK;
return ERR_PTR(retval);
}

--
2.34.1