/* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include /* for kmalloc() */ #include /* These mutexes are a very simple counter: the winner is the one who decrements from 1 to 0. The counter starts at 1 when the lock is free. A value other than 0 or 1 means someone may be sleeping. This is simple enough to work on all architectures, but has the problem that if we never "up" the semaphore it could eventually wrap around. */ /* FIXME: This may be way too small. --RR */ #define FUTEX_HASHBITS 6 /* We use this instead of a normal wait_queue_t, so we can wake only the relevent ones (hashed queues may be shared) */ struct futex_q { struct list_head list; /* Page struct and offset within it. */ struct page *page; unsigned int offset; struct task_struct *task; pid_t tgid; /* "0" if synchronous futex */ void *uaddr; /* async wait on */ int sig; /* signal to wakeup */ }; /* The key for the hash is the address + index + offset within page */ static struct list_head futex_queues[1<tgid = 0; q->task = current; q->page = page; q->offset = offset; spin_lock(&futex_lock); list_add_tail(&q->list, head); spin_unlock(&futex_lock); } /* Return 1 if we were still queued (ie. 0 means we were woken) */ static inline int unqueue_me(struct futex_q *q) { int ret = 0; spin_lock(&futex_lock); if (!list_empty(&q->list)) { list_del(&q->list); ret = 1; } spin_unlock(&futex_lock); return ret; } /* Add at end to avoid starvation */ static inline void queue_me_async(struct list_head *head, struct futex_q *q, struct page *page, unsigned int offset, void *uaddr, int sig) { q->tgid = current->tgid; q->task = current; q->page = page; q->offset = offset; q->uaddr = uaddr; q->sig = sig; spin_lock(&futex_lock); list_add_tail(&q->list, head); spin_unlock(&futex_lock); } /* Return 1 if we were still queued in wait queue and not in notify queue */ static inline int unqueue_me_async(struct futex_q *q) { int ret = 0; spin_lock(&futex_lock); if ((q->page != NULL) && (!list_empty(&q->list))) { list_del(&q->list); ret = 1; } spin_unlock(&futex_lock); return ret; } static int futex_wake(struct list_head *head, struct page *page, unsigned int offset, int num) { extern int sys_kill(int,int); struct list_head *i, *next; int num_woken = 0; spin_lock(&futex_lock); list_for_each_safe(i, next, head) { struct futex_q *this = list_entry(i, struct futex_q, list); if (this->page == page && this->offset == offset) { list_del_init(i); if (this->tgid == 0) { /* synchronous */ wake_up_process(this->task); } else { /* move to notification queue, release page */ list_add_tail(&this->list, ¬ify_queue); put_page(this->page); this->page = NULL; if (sys_kill(this->tgid,this->sig)) { /* target is dead */ list_del(&this->list); kfree(this); continue; } } num_woken++; if (num_woken >= num) break; } } spin_unlock(&futex_lock); return num_woken; } /* Get kernel address of the user page and pin it. */ static struct page *pin_page(unsigned long page_start) { struct mm_struct *mm = current->mm; struct page *page; int err; down_read(&mm->mmap_sem); err = get_user_pages(current, current->mm, page_start, 1 /* one page */, 1 /* writable */, 0 /* don't force */, &page, NULL /* don't return vmas */); up_read(&mm->mmap_sem); if (err < 0) return ERR_PTR(err); return page; } static int futex_wait(struct list_head *head, struct page *page, int offset, int val, unsigned long time) { int *count; struct futex_q q; int ret = 0; set_current_state(TASK_INTERRUPTIBLE); queue_me(head, &q, page, offset); count = kmap(page) + offset; if (*count != val) { ret = -EWOULDBLOCK; set_current_state(TASK_RUNNING); kunmap(page); goto out; } kunmap(page); time = schedule_timeout(time); if (time == 0) { ret = -ETIMEDOUT; goto out; } if (signal_pending(current)) { ret = -EINTR; goto out; } out: /* Were we woken up anyway? */ if (!unqueue_me(&q)) return 0; return ret; } static int futex_await(struct list_head *head, struct page *page, int offset, int val, void *uaddr, int sig) { int *count; struct futex_q *q = kmalloc(sizeof(struct futex_q),GFP_KERNEL); int ret = 0; set_current_state(TASK_INTERRUPTIBLE); queue_me_async(head, q, page, offset, uaddr, sig); count = kmap(page) + offset; if (*count != val) { set_current_state(TASK_RUNNING); if (unqueue_me_async(q)) { kfree(q); ret = -EAGAIN; } } kunmap(page); return(ret); } static int futex_awaiters(void **uaddr, int num) { struct list_head *i, *next; int num_woken = 0; int rc; spin_lock(&futex_lock); list_for_each_safe(i, next, ¬ify_queue) { struct futex_q *this = list_entry(i, struct futex_q, list); if (this->tgid == current->tgid) { if (num_woken >= num) goto out; if ((rc = put_user(this->uaddr,&uaddr[num_woken]))) { /* all notifications selected sofar will be lost */ /* we could recreate them from **uaddr */ num_woken = rc; break; } list_del(i); kfree(this); num_woken++; } } out: spin_unlock(&futex_lock); return num_woken; } /* cleanup called from do_exit */ void __exit_futex(struct task_struct *task) { struct list_head *i, *next; spin_lock(&futex_lock); list_for_each_safe(i, next, ¬ify_queue) { struct futex_q *this = list_entry(i, struct futex_q, list); if (this->tgid == task->tgid) { list_del(i); kfree(this); } } spin_unlock(&futex_lock); } asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime) { int ret; unsigned long pos_in_page; struct list_head *head; struct page *page; unsigned long time; /* first handle the special case commands */ if (op == FUTEX_AWAKERS) { return futex_awaiters((void**)uaddr, val); } time = MAX_SCHEDULE_TIMEOUT; if (utime) { struct timespec t; if (copy_from_user(&t, utime, sizeof(t)) != 0) return -EFAULT; time = timespec_to_jiffies(&t) + 1; } pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE; /* Must be "naturally" aligned, and not on page boundary. */ if ((pos_in_page % __alignof__(int)) != 0 || pos_in_page + sizeof(int) > PAGE_SIZE) return -EINVAL; /* Simpler if it doesn't vanish underneath us. */ page = pin_page((unsigned long)uaddr - pos_in_page); if (IS_ERR(page)) return PTR_ERR(page); head = hash_futex(page, pos_in_page); switch (op) { case FUTEX_WAIT: ret = futex_wait(head, page, pos_in_page, val, time); break; case FUTEX_WAKE: ret = futex_wake(head, page, pos_in_page, val); break; case FUTEX_AWAIT: ret = futex_await(head, page, pos_in_page, val, uaddr, (int)utime); if (!ret) goto out ; /* don't release the page */ break; default: ret = -EINVAL; } put_page(page); out: return ret; } static int __init init(void) { unsigned int i; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) INIT_LIST_HEAD(&futex_queues[i]); return 0; } __initcall(init);