Re: [PATCH] rmap: Fix Bugzilla Bug #5493

From: Christian Ehrhardt
Date: Fri Mar 05 2010 - 20:09:04 EST



Hi Rik,

On Fri, Mar 05, 2010 at 09:10:53AM -0500, Rik van Riel wrote:
> Your patch will not apply against a current -mm, because it
> conflicts with my anon_vma linking patches (which attacks
> another "rmap walks too many vmas" failure mode).
>
> Please rediff your patch against the latest -mm tree.

Conflict resolution turned out to be somewhat difficult. The following
is a patch against the -mm tree from git://zen-kernel.org/kernel/mmotm.git
of today.

It compiles but I probably won't have time to test the -mm version
before next week.

There is one caveat: I had to remove the debugging patch from
mm/prio_tree.c because this check can no longer be done at that place.
If it is still required, it should be done in a macro inside the
callers that pass vmas and not anon_vma_chains to this function.

regards Christian

Signed-off-by: Christian Ehrhardt <lk@xxxxxxx>

arch/arm/mm/fault-armv.c | 3 +-
arch/arm/mm/flush.c | 3 +-
arch/parisc/kernel/cache.c | 3 +-
arch/x86/mm/hugetlbpage.c | 3 +-
fs/hugetlbfs/inode.c | 3 +-
fs/inode.c | 2 +-
include/linux/mm.h | 28 ++++++--
include/linux/mm_types.h | 10 +---
include/linux/prio_tree.h | 17 ++++-
include/linux/rmap.h | 11 ++--
kernel/fork.c | 2 +-
lib/prio_tree.c | 14 +++-
mm/filemap_xip.c | 3 +-
mm/fremap.c | 2 +-
mm/hugetlb.c | 3 +-
mm/ksm.c | 21 +++++-
mm/memory-failure.c | 9 ++-
mm/memory.c | 5 +-
mm/mmap.c | 26 ++++----
mm/nommu.c | 12 ++--
mm/prio_tree.c | 161 ++++++++++++++++++--------------------------
mm/rmap.c | 38 +++++++---
22 files changed, 208 insertions(+), 171 deletions(-)

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index c9b97e9..4b8d01f 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -117,7 +117,8 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
* cache coherency.
*/
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(mpnt, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
/*
* If this VMA is not in our MM, we can ignore it.
* Note that we intentionally mask out the VMA
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index e34f095..5264230 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -194,7 +194,8 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p
pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

flush_dcache_mmap_lock(mapping);
- vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(mpnt, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
unsigned long offset;

/*
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index d054f3d..bf9890b 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -365,7 +365,8 @@ void flush_dcache_page(struct page *page)
* to flush one address here for them all to become coherent */

flush_dcache_mmap_lock(mapping);
- vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(mpnt, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
addr = mpnt->vm_start + offset;

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f46c340..f7a7954 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -74,7 +74,8 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
return;

spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+ vma_prio_tree_foreach(svma, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d..1827552 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -400,7 +400,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
struct vm_area_struct *vma;
struct prio_tree_iter iter;

- vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter, root,
+ pgoff, ULONG_MAX) {
unsigned long v_offset;

/*
diff --git a/fs/inode.c b/fs/inode.c
index 2800597..3a27f74 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -262,7 +262,7 @@ void inode_init_once(struct inode *inode)
spin_lock_init(&inode->i_data.i_mmap_lock);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
- INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
+ INIT_SHARED_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
i_size_ordered_init(inode);
#ifdef CONFIG_FSNOTIFY
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb1144f..632d4c5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1200,15 +1200,29 @@ extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* prio_tree.c */
-void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
-void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
-void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
+void vma_prio_tree_add(union vma_prio_tree_node *node,
+ union vma_prio_tree_node *old);
+void vma_prio_tree_insert(union vma_prio_tree_node *, struct prio_tree_root *);
+void vma_prio_tree_remove(union vma_prio_tree_node *, struct prio_tree_root *);
+union vma_prio_tree_node *vma_prio_tree_next(union vma_prio_tree_node *,
struct prio_tree_iter *iter);

-#define vma_prio_tree_foreach(vma, iter, root, begin, end) \
- for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \
- (vma = vma_prio_tree_next(vma, iter)); )
+#define vma_prio_tree_first_entry(iter, type, field) ({ \
+ union vma_prio_tree_node *__t; \
+ __t = vma_prio_tree_next(NULL, iter); \
+ __t ? prio_tree_entry(__t, type, field) : NULL;\
+})
+
+#define vma_prio_tree_next_entry(obj, iter, type, field) ({ \
+ union vma_prio_tree_node *__t; \
+ __t = vma_prio_tree_next(&(obj)->field, iter); \
+ __t ? prio_tree_entry(__t, type, field) : NULL; \
+})
+
+#define vma_prio_tree_foreach(obj, type, field, iter, root, begin, end) \
+ for (prio_tree_iter_init(iter, root, begin, end), /* one statement */ \
+ obj = vma_prio_tree_first_entry(iter, type, field); obj ; \
+ (obj = vma_prio_tree_next_entry(obj, iter, type, field)))

static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
struct list_head *list)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 048b462..06b74c1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -147,15 +147,7 @@ struct vm_area_struct {
* linkage to the list of like vmas hanging off its node, or
* linkage of vma in the address_space->i_mmap_nonlinear list.
*/
- union {
- struct {
- struct list_head list;
- void *parent; /* aligns with prio_tree_node parent */
- struct vm_area_struct *head;
- } vm_set;
-
- struct raw_prio_tree_node prio_tree_node;
- } shared;
+ union vma_prio_tree_node shared;

/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
diff --git a/include/linux/prio_tree.h b/include/linux/prio_tree.h
index db04abb..ae683b2 100644
--- a/include/linux/prio_tree.h
+++ b/include/linux/prio_tree.h
@@ -25,13 +25,25 @@ struct prio_tree_node {
unsigned long last; /* last location _in_ interval */
};

+union vma_prio_tree_node {
+ struct {
+ struct list_head list;
+ void *parent;
+ union vma_prio_tree_node *head;
+ } vm_set;
+ struct raw_prio_tree_node prio_tree_node;
+};
+
struct prio_tree_root {
struct prio_tree_node *prio_tree_node;
unsigned short index_bits;
unsigned short raw;
/*
* 0: nodes are of type struct prio_tree_node
- * 1: nodes are of type raw_prio_tree_node
+ * 1: nodes are of type raw_prio_tree_node and the vmas
+ * use the shared field.
+ * 2: nodes are of type raw_prio_tree_node and the vmas
+ * use the anon field.
*/
};

@@ -63,7 +75,8 @@ do { \
} while (0)

#define INIT_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 0)
-#define INIT_RAW_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 1)
+#define INIT_SHARED_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 1)
+#define INIT_ANON_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 2)

#define INIT_PRIO_TREE_NODE(ptr) \
do { \
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d25bd22..9e9a521 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
+#include <linux/prio_tree.h>
#include <linux/memcontrol.h>

/*
@@ -30,14 +31,14 @@ struct anon_vma {
atomic_t ksm_refcount;
#endif
/*
- * NOTE: the LSB of the head.next is set by
+ * NOTE: the LSB of the head.prio_tree_node is set by
* mm_take_all_locks() _after_ taking the above lock. So the
* head must only be read/written after taking the above lock
- * to be sure to see a valid next pointer. The LSB bit itself
- * is serialized by a system wide lock only visible to
+ * to be sure to see a valid prio_tree_node pointer. The LSB bit
+ * itself is serialized by a system wide lock only visible to
* mm_take_all_locks() (mm_all_locks_mutex).
*/
- struct list_head head; /* Chain of private "related" vmas */
+ struct prio_tree_root head;
};

/*
@@ -57,7 +58,7 @@ struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
struct list_head same_vma; /* locked by mmap_sem & page_table_lock */
- struct list_head same_anon_vma; /* locked by anon_vma->lock */
+ union vma_prio_tree_node same_anon_vma; /* locked by anon_vma->lock */
};

#ifdef CONFIG_MMU
diff --git a/kernel/fork.c b/kernel/fork.c
index b54abc4..51b16ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -355,7 +355,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- vma_prio_tree_add(tmp, mpnt);
+ vma_prio_tree_add(&tmp->shared, &mpnt->shared);
flush_dcache_mmap_unlock(mapping);
spin_unlock(&mapping->i_mmap_lock);
}
diff --git a/lib/prio_tree.c b/lib/prio_tree.c
index ccfd850..1d48709 100644
--- a/lib/prio_tree.c
+++ b/lib/prio_tree.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/prio_tree.h>
+#include <linux/rmap.h>

/*
* A clever mix of heap and radix trees forms a radix priority search tree (PST)
@@ -53,14 +54,21 @@ static void get_index(const struct prio_tree_root *root,
const struct prio_tree_node *node,
unsigned long *radix, unsigned long *heap)
{
- if (root->raw) {
+ if (root->raw == 1) {
struct vm_area_struct *vma = prio_tree_entry(
node, struct vm_area_struct, shared.prio_tree_node);

*radix = RADIX_INDEX(vma);
*heap = HEAP_INDEX(vma);
- }
- else {
+ } else if (root->raw == 2) {
+ struct vm_area_struct *vma;
+
+ vma = prio_tree_entry(node, struct anon_vma_chain,
+ same_anon_vma.prio_tree_node)->vma;
+
+ *radix = RADIX_INDEX(vma);
+ *heap = HEAP_INDEX(vma);
+ } else {
*radix = node->start;
*heap = node->last;
}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 78b94f0..f0e36fe 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,8 @@ __xip_unmap (struct address_space * mapping,

retry:
spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dac..dd0853c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,7 +211,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
spin_lock(&mapping->i_mmap_lock);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_prio_tree_remove(&vma->shared, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
spin_unlock(&mapping->i_mmap_lock);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb3..bbe3c0e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2245,7 +2245,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* __unmap_hugepage_range() is called as the lock is already held
*/
spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(iter_vma, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
continue;
diff --git a/mm/ksm.c b/mm/ksm.c
index a93f1b7..2eded1e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -326,7 +326,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item)
struct anon_vma *anon_vma = rmap_item->anon_vma;

if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
+ int empty = prio_tree_empty(&anon_vma->head);
spin_unlock(&anon_vma->lock);
if (empty)
anon_vma_free(anon_vma);
@@ -1562,12 +1562,17 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
return 0;
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ pgoff_t pgoff = rmap_item->address >> PAGE_SHIFT;
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;

spin_lock(&anon_vma->lock);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+
+ vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+ same_anon_vma, &iter,
+ &anon_vma->head, pgoff, pgoff) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1615,12 +1620,16 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
return SWAP_FAIL;
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ pgoff_t pgoff = rmap_item->address >> PAGE_SHIFT;
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;

spin_lock(&anon_vma->lock);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+ same_anon_vma, &iter,
+ &anon_vma->head, pgoff, pgoff) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1667,12 +1676,16 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
return ret;
again:
hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ pgoff_t pgoff = rmap_item->address >> PAGE_SHIFT;
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;

spin_lock(&anon_vma->lock);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+ same_anon_vma, &iter,
+ &anon_vma->head, pgoff, pgoff) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d1f3351..9ebe34c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -383,11 +383,14 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (av == NULL) /* Not actually mapped anymore */
goto out;
for_each_process (tsk) {
+ struct prio_tree_iter iter;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *vmac;

if (!task_early_kill(tsk))
continue;
- list_for_each_entry(vmac, &av->head, same_anon_vma) {
+ vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+ same_anon_vma, &iter, &av->head, pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
@@ -428,8 +431,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
if (!task_early_kill(tsk))
continue;

- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
- pgoff) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared,
+ &iter, &mapping->i_mmap, pgoff, pgoff) {
/*
* Send early kill signal to tasks where a vma covers
* the page but the corrupted page is not necessarily
diff --git a/mm/memory.c b/mm/memory.c
index f531087..af760c1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2411,7 +2411,8 @@ static void reset_vma_truncate_counts(struct address_space *mapping)
struct vm_area_struct *vma;
struct prio_tree_iter iter;

- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared,
+ &iter, &mapping->i_mmap, 0, ULONG_MAX)
vma->vm_truncate_count = 0;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
vma->vm_truncate_count = 0;
@@ -2472,7 +2473,7 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
pgoff_t vba, vea, zba, zea;

restart:
- vma_prio_tree_foreach(vma, &iter, root,
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter, root,
details->first_index, details->last_index) {
/* Skip quickly over those we have already dealt with */
if (vma->vm_truncate_count == details->truncate_count)
diff --git a/mm/mmap.c b/mm/mmap.c
index 6cfd507..4497e79 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -207,7 +207,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_NONLINEAR))
list_del_init(&vma->shared.vm_set.list);
else
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_prio_tree_remove(&vma->shared, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}

@@ -430,7 +430,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
if (unlikely(vma->vm_flags & VM_NONLINEAR))
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
else
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_prio_tree_insert(&vma->shared, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
@@ -593,9 +593,9 @@ again: remove_next = 1 + (end > next->vm_end);

if (root) {
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, root);
+ vma_prio_tree_remove(&vma->shared, root);
if (adjust_next)
- vma_prio_tree_remove(next, root);
+ vma_prio_tree_remove(&next->shared, root);
}

vma->vm_start = start;
@@ -608,8 +608,8 @@ again: remove_next = 1 + (end > next->vm_end);

if (root) {
if (adjust_next)
- vma_prio_tree_insert(next, root);
- vma_prio_tree_insert(vma, root);
+ vma_prio_tree_insert(&next->shared, root);
+ vma_prio_tree_insert(&vma->shared, root);
flush_dcache_mmap_unlock(mapping);
}

@@ -866,7 +866,7 @@ try_prev:
* It is potentially slow to have to call find_vma_prev here.
* But it's only on the first write fault on the vma, not
* every time, and we could devise a way to avoid it later
- * (e.g. stash info in next's anon_vma_node when assigning
+ * (e.g. stash info in next's anon node when assigning
* an anon_vma, or when trying vma_merge). Another time.
*/
BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
@@ -2440,7 +2440,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);

static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->head.prio_tree_node)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
@@ -2456,7 +2456,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* anon_vma->lock.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->head.prio_tree_node))
BUG();
}
}
@@ -2497,8 +2497,8 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* A single task can't take more than one mm_take_all_locks() in a row
* or it would deadlock.
*
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
- * mapping->flags avoid to take the same lock twice, if more than one
+ * The LSB in anon_vma->head.prio_tree_node and the AS_MM_ALL_LOCKS bitflag
+ * in mapping->flags avoid to take the same lock twice, if more than one
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
@@ -2547,7 +2547,7 @@ out_unlock:

static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->head.prio_tree_node)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
@@ -2561,7 +2561,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
* anon_vma->lock.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->head.prio_tree_node))
BUG();
spin_unlock(&anon_vma->lock);
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 605ace8..aa4f77a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -627,7 +627,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
mapping = vma->vm_file->f_mapping;

flush_dcache_mmap_lock(mapping);
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_prio_tree_insert(&vma->shared, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}

@@ -695,7 +695,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
mapping = vma->vm_file->f_mapping;

flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_prio_tree_remove(&vma->shared, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}

@@ -1965,8 +1965,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
down_write(&nommu_region_sem);

/* search for VMAs that fall within the dead zone */
- vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
- low, high) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+ &inode->i_mapping->i_mmap, low, high) {
/* found one - only interested if it's shared out of the page
* cache */
if (vma->vm_flags & VM_SHARED) {
@@ -1981,8 +1981,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
* we don't check for any regions that start beyond the EOF as there
* shouldn't be any
*/
- vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
- 0, ULONG_MAX) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+ &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
if (!(vma->vm_flags & VM_SHARED))
continue;

diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index c297a46..8e9194f 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -67,114 +67,85 @@
* vma->shared.vm_set.head == NULL ==> a list node
*/

-static void dump_vma(struct vm_area_struct *vma)
-{
- void **ptr = (void **) vma;
- int i;
-
- printk("vm_area_struct at %p:", ptr);
- for (i = 0; i < sizeof(*vma)/sizeof(*ptr); i++, ptr++) {
- if (!(i & 3))
- printk("\n");
- printk(" %p", *ptr);
- }
- printk("\n");
-}
-
/*
* Add a new vma known to map the same set of pages as the old vma:
* useful for fork's dup_mmap as well as vma_prio_tree_insert below.
* Note that it just happens to work correctly on i_mmap_nonlinear too.
*/
-void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
+void vma_prio_tree_add(union vma_prio_tree_node *node,
+ union vma_prio_tree_node *old)
{
- vma->shared.vm_set.head = NULL;
- vma->shared.vm_set.parent = NULL;
-
- if (WARN_ON(RADIX_INDEX(vma) != RADIX_INDEX(old) ||
- HEAP_INDEX(vma) != HEAP_INDEX(old))) {
- /*
- * This should never happen, yet it has been seen a few times:
- * we cannot say much about it without seeing the vma contents.
- */
- dump_vma(vma);
- dump_vma(old);
- /*
- * Don't try to link this (corrupt?) vma into the (corrupt?)
- * prio_tree, but arrange for its removal to succeed later.
- */
- INIT_LIST_HEAD(&vma->shared.vm_set.list);
- } else if (!old->shared.vm_set.parent)
- list_add(&vma->shared.vm_set.list,
- &old->shared.vm_set.list);
- else if (old->shared.vm_set.head)
- list_add_tail(&vma->shared.vm_set.list,
- &old->shared.vm_set.head->shared.vm_set.list);
+ node->vm_set.head = NULL;
+ node->vm_set.parent = NULL;
+
+ if (!old->vm_set.parent)
+ list_add(&node->vm_set.list, &old->vm_set.list);
+ else if (old->vm_set.head)
+ list_add_tail(&node->vm_set.list,
+ &old->vm_set.head->vm_set.list);
else {
- INIT_LIST_HEAD(&vma->shared.vm_set.list);
- vma->shared.vm_set.head = old;
- old->shared.vm_set.head = vma;
+ INIT_LIST_HEAD(&node->vm_set.list);
+ node->vm_set.head = old;
+ old->vm_set.head = node;
}
}

-void vma_prio_tree_insert(struct vm_area_struct *vma,
+void vma_prio_tree_insert(union vma_prio_tree_node *node,
struct prio_tree_root *root)
{
struct prio_tree_node *ptr;
- struct vm_area_struct *old;
+ union vma_prio_tree_node *old;

- vma->shared.vm_set.head = NULL;
+ node->vm_set.head = NULL;

- ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
- if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
- old = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- vma_prio_tree_add(vma, old);
+ ptr = raw_prio_tree_insert(root, &node->prio_tree_node);
+ if (ptr != (struct prio_tree_node *) &node->prio_tree_node) {
+ old = prio_tree_entry(ptr, union vma_prio_tree_node,
+ prio_tree_node);
+ vma_prio_tree_add(node, old);
}
}

-void vma_prio_tree_remove(struct vm_area_struct *vma,
+void vma_prio_tree_remove(union vma_prio_tree_node *target,
struct prio_tree_root *root)
{
- struct vm_area_struct *node, *head, *new_head;
+ union vma_prio_tree_node *node, *head, *new_head;

- if (!vma->shared.vm_set.head) {
- if (!vma->shared.vm_set.parent)
- list_del_init(&vma->shared.vm_set.list);
+ if (!target->vm_set.head) {
+ if (!target->vm_set.parent)
+ list_del_init(&target->vm_set.list);
else
- raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
+ raw_prio_tree_remove(root, &target->prio_tree_node);
} else {
/* Leave this BUG_ON till prio_tree patch stabilizes */
- BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
- if (vma->shared.vm_set.parent) {
- head = vma->shared.vm_set.head;
- if (!list_empty(&head->shared.vm_set.list)) {
- new_head = list_entry(
- head->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&head->shared.vm_set.list);
+ BUG_ON(target->vm_set.head->vm_set.head != target);
+ if (target->vm_set.parent) {
+ head = target->vm_set.head;
+ if (!list_empty(&head->vm_set.list)) {
+ new_head = prio_tree_entry(
+ head->vm_set.list.next,
+ union vma_prio_tree_node, vm_set.list);
+ list_del_init(&head->vm_set.list);
} else
new_head = NULL;

- raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
- &head->shared.prio_tree_node);
- head->shared.vm_set.head = new_head;
+ raw_prio_tree_replace(root, &target->prio_tree_node,
+ &head->prio_tree_node);
+ head->vm_set.head = new_head;
if (new_head)
- new_head->shared.vm_set.head = head;
+ new_head->vm_set.head = head;

} else {
- node = vma->shared.vm_set.head;
- if (!list_empty(&vma->shared.vm_set.list)) {
- new_head = list_entry(
- vma->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&vma->shared.vm_set.list);
- node->shared.vm_set.head = new_head;
- new_head->shared.vm_set.head = node;
+ node = target->vm_set.head;
+ if (!list_empty(&target->vm_set.list)) {
+ new_head = prio_tree_entry(
+ target->vm_set.list.next,
+ union vma_prio_tree_node, vm_set.list);
+ list_del_init(&target->vm_set.list);
+ node->vm_set.head = new_head;
+ new_head->vm_set.head = node;
} else
- node->shared.vm_set.head = NULL;
+ node->vm_set.head = NULL;
}
}
}
@@ -184,46 +155,46 @@ void vma_prio_tree_remove(struct vm_area_struct *vma,
* contiguous file pages. The function returns vmas that at least map a single
* page in the given range of contiguous file pages.
*/
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
+union vma_prio_tree_node *vma_prio_tree_next(union vma_prio_tree_node *node,
struct prio_tree_iter *iter)
{
struct prio_tree_node *ptr;
- struct vm_area_struct *next;
+ union vma_prio_tree_node *next;

- if (!vma) {
+ if (!node) {
/*
- * First call is with NULL vma
+ * First call is with NULL node
*/
ptr = prio_tree_next(iter);
if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
+ next = prio_tree_entry(ptr, union vma_prio_tree_node,
+ prio_tree_node);
+ prefetch(next->vm_set.head);
return next;
} else
return NULL;
}

- if (vma->shared.vm_set.parent) {
- if (vma->shared.vm_set.head) {
- next = vma->shared.vm_set.head;
- prefetch(next->shared.vm_set.list.next);
+ if (node->vm_set.parent) {
+ if (node->vm_set.head) {
+ next = node->vm_set.head;
+ prefetch(next->vm_set.list.next);
return next;
}
} else {
- next = list_entry(vma->shared.vm_set.list.next,
- struct vm_area_struct, shared.vm_set.list);
- if (!next->shared.vm_set.head) {
- prefetch(next->shared.vm_set.list.next);
+ next = list_entry(node->vm_set.list.next,
+ union vma_prio_tree_node, vm_set.list);
+ if (!next->vm_set.head) {
+ prefetch(next->vm_set.list.next);
return next;
}
}

ptr = prio_tree_next(iter);
if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
+ next = prio_tree_entry(ptr, union vma_prio_tree_node,
+ prio_tree_node);
+ prefetch(next->vm_set.head);
return next;
} else
return NULL;
diff --git a/mm/rmap.c b/mm/rmap.c
index fcd593c..34391d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -142,7 +142,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
avc->anon_vma = anon_vma;
avc->vma = vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- list_add(&avc->same_anon_vma, &anon_vma->head);
+ vma_prio_tree_insert(&avc->same_anon_vma,
+ &anon_vma->head);
allocated = NULL;
}
spin_unlock(&mm->page_table_lock);
@@ -170,7 +171,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
list_add(&avc->same_vma, &vma->anon_vma_chain);

spin_lock(&anon_vma->lock);
- list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ vma_prio_tree_insert(&avc->same_anon_vma, &anon_vma->head);
spin_unlock(&anon_vma->lock);
}

@@ -245,10 +246,10 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
return;

spin_lock(&anon_vma->lock);
- list_del(&anon_vma_chain->same_anon_vma);
+ vma_prio_tree_remove(&anon_vma_chain->same_anon_vma, &anon_vma->head);

/* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
+ empty = prio_tree_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
spin_unlock(&anon_vma->lock);

if (empty)
@@ -273,7 +274,7 @@ static void anon_vma_ctor(void *data)

spin_lock_init(&anon_vma->lock);
ksm_refcount_init(anon_vma);
- INIT_LIST_HEAD(&anon_vma->head);
+ INIT_ANON_PRIO_TREE_ROOT(&anon_vma->head);
}

void __init anon_vma_init(void)
@@ -483,9 +484,11 @@ static int page_referenced_anon(struct page *page,
struct mem_cgroup *mem_cont,
unsigned long *vm_flags)
{
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
unsigned int mapcount;
struct anon_vma *anon_vma;
struct anon_vma_chain *avc;
+ struct prio_tree_iter iter;
int referenced = 0;

anon_vma = page_lock_anon_vma(page);
@@ -493,7 +496,8 @@ static int page_referenced_anon(struct page *page,
return referenced;

mapcount = page_mapcount(page);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ vma_prio_tree_foreach(avc, struct anon_vma_chain, same_anon_vma,
+ &iter, &anon_vma->head, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -562,7 +566,8 @@ static int page_referenced_file(struct page *page,
*/
mapcount = page_mapcount(page);

- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
@@ -667,7 +672,8 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
BUG_ON(PageAnon(page));

spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -1132,15 +1138,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
*/
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma *anon_vma;
struct anon_vma_chain *avc;
+ struct prio_tree_iter iter;
int ret = SWAP_AGAIN;

anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return ret;

- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ vma_prio_tree_foreach(avc, struct anon_vma_chain, same_anon_vma,
+ &iter, &anon_vma->head, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -1182,7 +1191,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned int mapcount;

spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
@@ -1331,9 +1341,11 @@ int try_to_munlock(struct page *page)
static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *, unsigned long, void *), void *arg)
{
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma *anon_vma;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
+ struct prio_tree_iter iter;

/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma()
@@ -1347,7 +1359,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (!anon_vma)
return ret;
spin_lock(&anon_vma->lock);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ vma_prio_tree_foreach(avc, struct anon_vma_chain, same_anon_vma,
+ &iter, &anon_vma->head, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -1372,7 +1385,8 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
if (!mapping)
return ret;
spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+ &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/