Re: [PATCH v2 0/2] signalfd/epoll fixes

From: Linus Torvalds
Date: Fri Feb 24 2012 - 18:15:19 EST


On Fri, Feb 24, 2012 at 12:23 PM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> I'm *really* conflicted, because I have this really strong feeling
> that it's just papering over a symptom, and we damn well shouldn't be
> doing that. I really think that what we really should do is allow
> "poll()" to have a "poll_remove" callback (so each "add_poll_wait()"
> will have a callback when it gets remove).
>
> Then we could make the poll() functions actually do allocations and
> crap - or at least add refcounts - and the "poll_remove()" ones would
> undo them.

Ok, I have *NOT* tested this, and by now it's certainly not 3.3
material any more, but here's a patch to kind of lay the groundwork
and show what I mean.

UNTESTED! UNTESTED! CAVEAT! Probably horribly broken.

Linus
drivers/vhost/vhost.c | 5 +++--
fs/eventpoll.c | 9 ++++++++-
fs/select.c | 12 ++++++++----
fs/signalfd.c | 12 +++++++++++-
include/linux/fs.h | 1 +
include/linux/poll.h | 7 ++++---
kernel/cgroup.c | 3 ++-
net/9p/trans_fd.c | 5 +++--
virt/kvm/eventfd.c | 3 ++-
9 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c14c42b95ab8..5880f3d81333 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -42,14 +42,15 @@ static unsigned vhost_zcopy_mask __read_mostly;
#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])

-static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
- poll_table *pt)
+static int vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
+ poll_table *pt)
{
struct vhost_poll *poll;

poll = container_of(pt, struct vhost_poll, table);
poll->wqh = wqh;
add_wait_queue(wqh, &poll->wait);
+ return 1;
}

static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index aabdfc38cf24..010b9eb9de74 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -474,13 +474,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
*/
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
+ struct file *file = epi->ffd.file;
struct list_head *lsthead = &epi->pwqlist;
struct eppoll_entry *pwq;
+ void (*poll_rm) (struct file *, wait_queue_head_t *);

+ poll_rm = file->f_op->poll_rm;
while (!list_empty(lsthead)) {
pwq = list_first_entry(lsthead, struct eppoll_entry, llink);

list_del(&pwq->llink);
+ if (poll_rm)
+ poll_rm(file, pwq->whead);
remove_wait_queue(pwq->whead, &pwq->wait);
kmem_cache_free(pwq_cache, pwq);
}
@@ -903,7 +908,7 @@ out_unlock:
* This is the callback that is used to add our wait queue to the
* target file wakeup lists.
*/
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+static int ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
@@ -916,9 +921,11 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
+ return 1;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
+ return 0;
}
}

diff --git a/fs/select.c b/fs/select.c
index e782258d0de3..a4e369dda85c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,7 +110,7 @@ struct poll_table_page {
* as all select/poll functions have to call it to add an entry to the
* poll table.
*/
-static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
+static int __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p);

void poll_initwait(struct poll_wqueues *pwq)
@@ -126,8 +126,11 @@ EXPORT_SYMBOL(poll_initwait);

static void free_poll_entry(struct poll_table_entry *entry)
{
+ struct file *file = entry->filp;
+ if (file->f_op->poll_rm)
+ file->f_op->poll_rm(file, entry->wait_address);
remove_wait_queue(entry->wait_address, &entry->wait);
- fput(entry->filp);
+ fput(file);
}

void poll_freewait(struct poll_wqueues *pwq)
@@ -213,13 +216,13 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
}

/* Add a new entry */
-static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
+static int __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
- return;
+ return 0;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
@@ -227,6 +230,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
init_waitqueue_func_entry(&entry->wait, pollwake);
entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
+ return 1;
}

int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 492465b451dd..89afafafbf2e 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -40,12 +40,21 @@ static int signalfd_release(struct inode *inode, struct file *file)
return 0;
}

+static void signalfd_poll_rm(struct file *file, wait_queue_head_t *p)
+{
+ struct sighand_struct *sighand;
+
+ sighand = container_of(p, struct sighand_struct, signalfd_wqh);
+ __cleanup_sighand(sighand);
+}
+
static unsigned int signalfd_poll(struct file *file, poll_table *wait)
{
struct signalfd_ctx *ctx = file->private_data;
unsigned int events = 0;

- poll_wait(file, &current->sighand->signalfd_wqh, wait);
+ if (poll_wait(file, &current->sighand->signalfd_wqh, wait))
+ atomic_inc(&current->sighand->count);

spin_lock_irq(&current->sighand->siglock);
if (next_signal(&current->pending, &ctx->sigmask) ||
@@ -215,6 +224,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
static const struct file_operations signalfd_fops = {
.release = signalfd_release,
.poll = signalfd_poll,
+ .poll_rm = signalfd_poll_rm,
.read = signalfd_read,
.llseek = noop_llseek,
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 69cd5bb640f5..a96e193dfa6b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1604,6 +1604,7 @@ struct file_operations {
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ void (*poll_rm) (struct file *, wait_queue_head_t * wait_address);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/include/linux/poll.h b/include/linux/poll.h
index cf40010ce0cd..9b73a2557add 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -30,17 +30,18 @@ struct poll_table_struct;
/*
* structures and helpers for f_op->poll implementations
*/
-typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
+typedef int (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

typedef struct poll_table_struct {
poll_queue_proc qproc;
unsigned long key;
} poll_table;

-static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
+static inline int poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && wait_address)
- p->qproc(filp, wait_address, p);
+ return p->qproc(filp, wait_address, p);
+ return 0;
}

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f77..3029d66d8b51 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3520,7 +3520,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
return 0;
}

-static void cgroup_event_ptable_queue_proc(struct file *file,
+static int cgroup_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct cgroup_event *event = container_of(pt,
@@ -3528,6 +3528,7 @@ static void cgroup_event_ptable_queue_proc(struct file *file,

event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
+ return 1;
}

/*
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index fccae26fa674..d87b30b6e8e9 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -538,7 +538,7 @@ static int p9_pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
* called by files poll operation to add v9fs-poll task to files wait queue
*/

-static void
+static int
p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
struct p9_conn *m = container_of(p, struct p9_conn, pt);
@@ -554,13 +554,14 @@ p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)

if (!pwait) {
p9_debug(P9_DEBUG_ERROR, "not enough wait_address slots\n");
- return;
+ return 0;
}

pwait->conn = m;
pwait->wait_addr = wait_address;
init_waitqueue_func_entry(&pwait->wait, p9_pollwake);
add_wait_queue(wait_address, &pwait->wait);
+ return 1;
}

/**
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index f59c1e8de7a2..f5e59036a937 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -168,12 +168,13 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
return 0;
}

-static void
+static int
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
add_wait_queue(wqh, &irqfd->wait);
+ return 1;
}

/* Must be called under irqfds.lock */