Signed-off-by: Eric Dumazet <dada1@xxxxxxxxxxxxx>
--- linux-2.6.21-rc5-mm4/kernel/futex.c
+++ linux-2.6.21-rc5-mm4-ed/kernel/futex.c
@@ -16,6 +16,9 @@
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@xxxxxxxxxx>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@xxxxxxxxxxx>
*
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@xxxxxxxxxxxxx>
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -199,9 +202,12 @@ static inline int match_futex(union fute
* Returns: 0, or negative error code.
* The key words are stored in *key on success.
*
- * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ * shared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to &current->mm->mmap_sem and
+ * caller must have taken the reader lock, but NOT any spinlocks.
*/
-int get_futex_key(void __user *uaddr, union futex_key *key)
+int get_futex_key(void __user *uaddr, union futex_key *key,
+ struct rw_semaphore *shared)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -218,6 +224,22 @@ int get_futex_key(void __user *uaddr, un
address -= key->both.offset;
/*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'address' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ * Note : At this point, address points to the start of page,
+ * not the real futex address, this is ok.
+ */
+ if (!shared) {
+ if (!access_ok(VERIFY_WRITE, address, sizeof(int)))
+ return -EFAULT;
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+ /*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
@@ -244,6 +266,7 @@ int get_futex_key(void __user *uaddr, un
* mappings of _writable_ handles.
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+ key->both.offset += FUT_OFF_MMSHARED; /* reference taken on mm */
key->private.mm = mm;
key->private.address = address;
return 0;
@@ -253,7 +276,7 @@ int get_futex_key(void __user *uaddr, un
* Linear file mappings are also simple.
*/
key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+ key->both.offset += FUT_OFF_INODE; /* inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
@@ -281,17 +304,19 @@ EXPORT_SYMBOL_GPL(get_futex_key);
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
inline void get_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
atomic_inc(&key->shared.inode->i_count);
- else
+ break;
+ case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
- }
+ break;
+ }
}
EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -301,11 +326,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
*/
void drop_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
iput(key->shared.inode);
- else
+ break;
+ case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
+ break;
}
}
EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -339,28 +368,40 @@ get_futex_value_locked(unsigned long *de
}
/*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * if shared is non NULL, current->mm->mmap_sem is already held
*/
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address, int attempt,
+ struct rw_semaphore *shared)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
+ int ret = 0;
- if (attempt > 2 || !(vma = find_vma(mm, address)) ||
- vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+ if (attempt > 2)
return -EFAULT;
- switch (handle_mm_fault(mm, vma, address, 1)) {
- case VM_FAULT_MINOR:
- current->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- current->maj_flt++;
- break;
- default:
- return -EFAULT;
- }
- return 0;
+ if (!shared)
+ down_read(&mm->mmap_sem);
+
+ if (!(vma = find_vma(mm, address)) ||
+ vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+ ret = -EFAULT;
+
+ else
+ switch (handle_mm_fault(mm, vma, address, 1)) {
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ current->maj_flt++;
+ break;
+ default:
+ ret = -EFAULT;
+ }
+ if (!shared)
+ up_read(&mm->mmap_sem);
+ return ret;
}
/*
@@ -1598,6 +1656,8 @@ static int futex_wait(unsigned long __us
restart->arg1 = val;
restart->arg2 = (unsigned long)abs_time;
restart->arg3 = (unsigned long)futex64;
+ if (shared)
+ restart->arg3 |= 2;
@@ -2377,23 +2455,24 @@ sys_futex64(u64 __user *uaddr, int op, u
struct timespec ts;
ktime_t t, *tp = NULL;
u64 val2 = 0;
+ int opm = op & FUTEX_CMD_MASK;
- if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
+ if (utime && (opm == FUTEX_WAIT || opm == FUTEX_LOCK_PI)) {