[PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone

From: Jann Horn
Date: Mon Nov 02 2020 - 15:46:16 EST


At the moment, the seccomp notifier API is hard to use without combining
it with APIs like poll() or epoll(); if all target processes have gone
away, the polling APIs will raise an error indication on the file
descriptor, but SECCOMP_IOCTL_NOTIF_RECV will keep blocking indefinitely.

This usability problem was discovered when Michael Kerrisk wrote a
manpage for this API.

To fix it, get rid of the semaphore logic and let SECCOMP_IOCTL_NOTIF_RECV
behave as follows:

If O_NONBLOCK is set, SECCOMP_IOCTL_NOTIF_RECV always returns
immediately, no matter whether a notification is available or not.

If O_NONBLOCK is unset, SECCOMP_IOCTL_NOTIF_RECV blocks until either a
notification is delivered to userspace or all users of the filter have
gone away.

To avoid subtle breakage from eventloop-style code that doesn't set
O_NONBLOCK, set O_NONBLOCK by default - userspace can clear it if it
wants blocking behavior, and if blocking-style code forgets to do so,
that will be much more obvious than the breakage we'd get the other way
around.
This also means that UAPI breakage from this change should be limited to
blocking users of the API, of which, to my knowledge, there are none so far
(outside of in-tree sample and selftest code, which this patch adjusts - in
particular the code in samples/ has to change a bunch).

This should be backported because future userspace code might otherwise not
work properly on old kernels.

Cc: stable@xxxxxxxxxxxxxxx
Reported-by: Michael Kerrisk <mtk.manpages@xxxxxxxxx>
Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
---
kernel/seccomp.c | 62 +++++++++++++------
samples/seccomp/user-trap.c | 16 +++++
tools/testing/selftests/seccomp/seccomp_bpf.c | 21 +++++++
3 files changed, 79 insertions(+), 20 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 8ad7a293255a..b3730740515f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -43,6 +43,7 @@
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
#include <linux/lockdep.h>
+#include <linux/freezer.h>

/*
* When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
@@ -138,7 +139,6 @@ struct seccomp_kaddfd {
* @notifications: A list of struct seccomp_knotif elements.
*/
struct notification {
- struct semaphore request;
u64 next_id;
struct list_head notifications;
};
@@ -863,7 +863,6 @@ static int seccomp_do_user_notification(int this_syscall,
list_add(&n.list, &match->notif->notifications);
INIT_LIST_HEAD(&n.addfd);

- up(&match->notif->request);
wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
mutex_unlock(&match->notify_lock);

@@ -1179,9 +1178,10 @@ find_notification(struct seccomp_filter *filter, u64 id)


static long seccomp_notify_recv(struct seccomp_filter *filter,
- void __user *buf)
+ void __user *buf, bool blocking)
{
struct seccomp_knotif *knotif = NULL, *cur;
+ DEFINE_WAIT(wait);
struct seccomp_notif unotif;
ssize_t ret;

@@ -1194,11 +1194,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,

memset(&unotif, 0, sizeof(unotif));

- ret = down_interruptible(&filter->notif->request);
- if (ret < 0)
- return ret;
-
mutex_lock(&filter->notify_lock);
+
+retry:
list_for_each_entry(cur, &filter->notif->notifications, list) {
if (cur->state == SECCOMP_NOTIFY_INIT) {
knotif = cur;
@@ -1206,14 +1204,40 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
}
}

- /*
- * If we didn't find a notification, it could be that the task was
- * interrupted by a fatal signal between the time we were woken and
- * when we were able to acquire the rw lock.
- */
if (!knotif) {
- ret = -ENOENT;
- goto out;
+ if (!blocking) {
+ if (refcount_read(&filter->users) == 0)
+ ret = -ENOTCONN;
+ else
+ ret = -ENOENT;
+ goto out;
+ }
+
+ /* This has to happen before checking &filter->users. */
+ prepare_to_wait(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
+
+ /*
+ * If all users of the filter are gone, throw an error
+ * instead of pointlessly continuing to block.
+ */
+ if (refcount_read(&filter->users) == 0) {
+ finish_wait(&filter->wqh, &wait);
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ /* No notifications - wait for one... */
+ mutex_unlock(&filter->notify_lock);
+ freezable_schedule();
+ finish_wait(&filter->wqh, &wait);
+
+ /* ... and retry */
+ mutex_lock(&filter->notify_lock);
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ goto retry;
}

unotif.id = knotif->id;
@@ -1237,10 +1261,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
*/
mutex_lock(&filter->notify_lock);
knotif = find_notification(filter, unotif.id);
- if (knotif) {
+ if (knotif)
knotif->state = SECCOMP_NOTIFY_INIT;
- up(&filter->notif->request);
- }
mutex_unlock(&filter->notify_lock);
}

@@ -1416,11 +1438,12 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
{
struct seccomp_filter *filter = file->private_data;
void __user *buf = (void __user *)arg;
+ bool blocking = !(file->f_flags & O_NONBLOCK);

/* Fixed-size ioctls */
switch (cmd) {
case SECCOMP_IOCTL_NOTIF_RECV:
- return seccomp_notify_recv(filter, buf);
+ return seccomp_notify_recv(filter, buf, blocking);
case SECCOMP_IOCTL_NOTIF_SEND:
return seccomp_notify_send(filter, buf);
case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
@@ -1483,12 +1506,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
if (!filter->notif)
goto out;

- sema_init(&filter->notif->request, 0);
filter->notif->next_id = get_random_u64();
INIT_LIST_HEAD(&filter->notif->notifications);

ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
- filter, O_RDWR);
+ filter, O_RDWR|O_NONBLOCK);
if (IS_ERR(ret))
goto out_notif;

diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
index 20291ec6489f..b9e666f15998 100644
--- a/samples/seccomp/user-trap.c
+++ b/samples/seccomp/user-trap.c
@@ -198,6 +198,17 @@ static int handle_req(struct seccomp_notif *req,
return ret;
}

+static void set_blocking(int fd)
+{
+ int file_status_flags = fcntl(fd, F_GETFL);
+
+ file_status_flags &= ~O_NONBLOCK;
+ if (fcntl(fd, F_SETFL, file_status_flags)) {
+ perror("F_SETFL");
+ exit(1);
+ }
+}
+
int main(void)
{
int sk_pair[2], ret = 1, status, listener;
@@ -274,6 +285,8 @@ int main(void)
if (listener < 0)
goto out_kill;

+ set_blocking(listener);
+
/*
* Fork a task to handle the requests. This isn't strictly necessary,
* but it makes the particular writing of this sample easier, since we
@@ -307,6 +320,9 @@ int main(void)
while (1) {
memset(req, 0, sizes.seccomp_notif);
if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
+ if (errno == ENOTCONN)
+ exit(0);
+
perror("ioctl recv");
goto out_resp;
}
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 4a180439ee9e..5318f9cb1aec 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -295,6 +295,13 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
#endif
}

+#define set_blocking(fd) ({ \
+ int file_status_flags; \
+ file_status_flags = fcntl(fd, F_GETFL); \
+ file_status_flags &= ~O_NONBLOCK; \
+ ASSERT_EQ(fcntl(fd, F_SETFL, file_status_flags), 0); \
+})
+
/* Have TH_LOG report actual location filecmp() is used. */
#define filecmp(pid1, pid2, fd1, fd2) ({ \
int _ret; \
@@ -3422,6 +3429,8 @@ TEST(user_notification_kill_in_middle)
SECCOMP_FILTER_FLAG_NEW_LISTENER);
ASSERT_GE(listener, 0);

+ set_blocking(listener);
+
/*
* Check that nothing bad happens when we kill the task in the middle
* of a syscall.
@@ -3476,6 +3485,8 @@ TEST(user_notification_signal)
SECCOMP_FILTER_FLAG_NEW_LISTENER);
ASSERT_GE(listener, 0);

+ set_blocking(listener);
+
pid = fork();
ASSERT_GE(pid, 0);

@@ -3583,6 +3594,8 @@ TEST(user_notification_child_pid_ns)
SECCOMP_FILTER_FLAG_NEW_LISTENER);
ASSERT_GE(listener, 0);

+ set_blocking(listener);
+
pid = fork();
ASSERT_GE(pid, 0);

@@ -3623,6 +3636,8 @@ TEST(user_notification_sibling_pid_ns)
SECCOMP_FILTER_FLAG_NEW_LISTENER);
ASSERT_GE(listener, 0);

+ set_blocking(listener);
+
pid = fork();
ASSERT_GE(pid, 0);

@@ -3691,6 +3706,8 @@ TEST(user_notification_fault_recv)
SECCOMP_FILTER_FLAG_NEW_LISTENER);
ASSERT_GE(listener, 0);

+ set_blocking(listener);
+
pid = fork();
ASSERT_GE(pid, 0);

@@ -3972,6 +3989,8 @@ TEST(user_notification_addfd)
SECCOMP_FILTER_FLAG_NEW_LISTENER);
ASSERT_GE(listener, 0);

+ set_blocking(listener);
+
pid = fork();
ASSERT_GE(pid, 0);

@@ -4099,6 +4118,8 @@ TEST(user_notification_addfd_rlimit)
SECCOMP_FILTER_FLAG_NEW_LISTENER);
ASSERT_GE(listener, 0);

+ set_blocking(listener);
+
pid = fork();
ASSERT_GE(pid, 0);


base-commit: 4525c8781ec0701ce824e8bd379ae1b129e26568
--
2.29.1.341.ge80a0c044ae-goog