[PATCH RFC] mm/mmu_notifier: get rid of srcu-based synchronization

From: Konstantin Khlebnikov
Date: Mon Jan 21 2013 - 06:57:19 EST


This patch removes srcu-based protection from mmu-notitifier and collects all
synchronization in mmu_notifier_unregister()/mmu_notifier_release().

All mmu notifier methods are called either under mmap_sem or under one of rmap
locks: root anon vma lock or inode mmaping lock. Because they always operates in
particular vma and caller must protect it somehow. Thus we can use mmap_sem and
rmap locks for waiting for all currently running mmu-notifier methods.

This patch adds new helper function: mm_synchronize_all_locks(). This function
acquires and releases all rmap locks one by one, it much faster than sequence
mm_take_all_locks() - mm_drop_all_locks(). mm_synchronize_all_locks() needs only
mmap_sem read-lock, but caller must lock mmap_sem for write at least once to
synchronize with the rest operations which are protected with mmap_sem read-lock.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
---
include/linux/mm.h | 1
include/linux/mmu_notifier.h | 1
mm/mmap.c | 26 ++++++++++++
mm/mmu_notifier.c | 91 ++++++++++++++----------------------------
4 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 66e2f7c..86cf9f3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1461,6 +1461,7 @@ extern void exit_mmap(struct mm_struct *);

extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);
+extern void mm_synchronize_all_locks(struct mm_struct *mm);

extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index bc823c4..fa55d89 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -4,7 +4,6 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
-#include <linux/srcu.h>

struct mmu_notifier;
struct mmu_notifier_ops;
diff --git a/mm/mmap.c b/mm/mmap.c
index 0fb6805..89e884e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3043,6 +3043,32 @@ void mm_drop_all_locks(struct mm_struct *mm)
}

/*
+ * This function waits for all running pte operations which are protected with
+ * rmap locks like try_to_unmap(). To wait for rest operations like page-faults
+ * which runs under mmap_sem caller must lock mmap_sem for write at least once.
+ * This function itself requires only mmap_sem locked for read.
+ */
+void mm_synchronize_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ BUG_ON(down_write_trylock(&mm->mmap_sem));
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ if (vma->vm_file && vma->vm_file->f_mapping) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ }
+ }
+}
+
+/*
* initialise the VMA slab
*/
void __init mmap_init(void)
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8a5ac8c..6b77a9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,37 +14,31 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
-#include <linux/srcu.h>
-#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>

-/* global SRCU for all MMs */
-static struct srcu_struct srcu;
-
/*
* This function can't run concurrently against mmu_notifier_register
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
* runs with mm_users == 0. Other tasks may still invoke mmu notifiers
* in parallel despite there being no task using this mm any more,
* through the vmas outside of the exit_mmap context, such as with
- * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to SRCU and it serializes
- * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
- * can't go away from under us as exit_mmap holds an mm_count pin
- * itself.
+ * unmap_mapping_range(). This serializes against mmu_notifier_unregister()
+ * and other mmu notifiers with the mm->mmap_sem and the mmu_notifier_mm->lock.
+ * struct mmu_notifier_mm can't go away from under us as exit_mmap holds
+ * an mm_count pin itself.
*/
void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int id;

/*
- * SRCU here will block mmu_notifier_unregister until
- * ->release returns.
+ * Block mmu_notifier_unregister() until ->release returns
+ * and synchronize with concurrent page-faults.
*/
- id = srcu_read_lock(&srcu);
+ down_write(&mm->mmap_sem);
+
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
/*
* if ->release runs before mmu_notifier_unregister it
@@ -55,7 +49,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
- srcu_read_unlock(&srcu, id);

spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -73,15 +66,19 @@ void __mmu_notifier_release(struct mm_struct *mm)
spin_unlock(&mm->mmu_notifier_mm->lock);

/*
- * synchronize_srcu here prevents mmu_notifier_release to
+ * locked mm->mmap_sem here prevents mmu_notifier_release to
* return to exit_mmap (which would proceed freeing all pages
* in the mm) until the ->release method returns, if it was
* invoked by mmu_notifier_unregister.
*
* The mmu_notifier_mm can't go away from under us because one
* mm_count is hold by exit_mmap.
+ *
+ * Explicit synchronization with mmu notifier methods isn't
+ * required because exit_mmap() calls free_pgtables() after us,
+ * which locks/unlocks all locks like mm_synchronize_all_locks().
*/
- synchronize_srcu(&srcu);
+ up_write(&mm->mmap_sem);
}

/*
@@ -94,14 +91,12 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0, id;
+ int young = 0;

- id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->clear_flush_young)
young |= mn->ops->clear_flush_young(mn, mm, address);
}
- srcu_read_unlock(&srcu, id);

return young;
}
@@ -111,9 +106,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0, id;
+ int young = 0;

- id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->test_young) {
young = mn->ops->test_young(mn, mm, address);
@@ -121,7 +115,6 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
break;
}
}
- srcu_read_unlock(&srcu, id);

return young;
}
@@ -131,14 +124,11 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int id;

- id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->change_pte)
mn->ops->change_pte(mn, mm, address, pte);
}
- srcu_read_unlock(&srcu, id);
}

void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,14 +136,11 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int id;

- id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_page)
mn->ops->invalidate_page(mn, mm, address);
}
- srcu_read_unlock(&srcu, id);
}

void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -161,14 +148,11 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int id;

- id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start)
mn->ops->invalidate_range_start(mn, mm, start, end);
}
- srcu_read_unlock(&srcu, id);
}

void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -176,14 +160,11 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int id;

- id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
}
- srcu_read_unlock(&srcu, id);
}

static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -195,12 +176,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,

BUG_ON(atomic_read(&mm->mm_users) <= 0);

- /*
- * Verify that mmu_notifier_init() already run and the global srcu is
- * initialized.
- */
- BUG_ON(!srcu.per_cpu_ref);
-
ret = -ENOMEM;
mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
if (unlikely(!mmu_notifier_mm))
@@ -283,8 +258,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
/*
* This releases the mm_count pin automatically and frees the mm
* structure if it was the last user of it. It serializes against
- * running mmu notifiers with SRCU and against mmu_notifier_unregister
- * with the unregister lock + SRCU. All sptes must be dropped before
+ * mmu_notifier_unregister() with mmap_sem and agaings mmu notifiers
+ * with the mmap_sem + rmap locks. All sptes must be dropped before
* calling mmu_notifier_unregister. ->release or any other notifier
* method may be invoked concurrently with mmu_notifier_unregister,
* and only after mmu_notifier_unregister returned we're guaranteed
@@ -294,14 +269,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);

- if (!hlist_unhashed(&mn->hlist)) {
- /*
- * SRCU here will force exit_mmap to wait ->release to finish
- * before freeing the pages.
- */
- int id;
+ /*
+ * Synchronize with concurrent mmu_notifier_release() and page-faults.
+ */
+ down_write(&mm->mmap_sem);

- id = srcu_read_lock(&srcu);
+ if (!hlist_unhashed(&mn->hlist)) {
/*
* exit_mmap will block in mmu_notifier_release to
* guarantee ->release is called before freeing the
@@ -309,28 +282,24 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
- srcu_read_unlock(&srcu, id);

spin_lock(&mm->mmu_notifier_mm->lock);
hlist_del_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
}

+ downgrade_write(&mm->mmap_sem);
/*
- * Wait any running method to finish, of course including
- * ->release if it was run by mmu_notifier_relase instead of us.
+ * Wait any running method to finish. All such operations are protected
+ * either with mm->mmap_sem like handle_pte_fault() or with rmap locks
+ * like try_to_unmap(). They need one of these locks to protect vma/pte
+ * area where they operates.
*/
- synchronize_srcu(&srcu);
+ mm_synchronize_all_locks(mm);
+ up_read(&mm->mmap_sem);

BUG_ON(atomic_read(&mm->mm_count) <= 0);

mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
-
-static int __init mmu_notifier_init(void)
-{
- return init_srcu_struct(&srcu);
-}
-
-module_init(mmu_notifier_init);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/