Re: [linux-next: Tree for Jun 1] __khugepaged_exit rwsem_down_write_failed lockup

From: Sergey Senozhatsky
Date: Fri Jun 03 2016 - 04:43:59 EST


On (06/03/16 09:25), Michal Hocko wrote:
> > it's quite hard to trigger the bug (somehow), so I can't
> > follow up with more information as of now.

either I did something very silly while fixing up the patch, or the
patch may be causing general protection faults on my system.

RIP collect_mm_slot() + 0x42/0x84
khugepaged
prepare_to_wait_event
maybe_pmd_mkwrite
kthread
_raw_spin_unlock_irq
ret_from_fork
kthread_create_on_node

collect_mm_slot() + 0x42/0x84 is

0000000000000328 <collect_mm_slot>:
328: 55 push %rbp
329: 48 89 e5 mov %rsp,%rbp
32c: 53 push %rbx
32d: 48 8b 5f 20 mov 0x20(%rdi),%rbx
331: 8b 43 48 mov 0x48(%rbx),%eax
334: ff c8 dec %eax
336: 7f 71 jg 3a9 <collect_mm_slot+0x81>
338: 48 8b 57 08 mov 0x8(%rdi),%rdx
33c: 48 85 d2 test %rdx,%rdx
33f: 74 1e je 35f <collect_mm_slot+0x37>
341: 48 8b 07 mov (%rdi),%rax
344: 48 85 c0 test %rax,%rax
347: 48 89 02 mov %rax,(%rdx)
34a: 74 04 je 350 <collect_mm_slot+0x28>
34c: 48 89 50 08 mov %rdx,0x8(%rax)
350: 48 c7 07 00 00 00 00 movq $0x0,(%rdi)
357: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi)
35e: 00
35f: 48 8b 57 10 mov 0x10(%rdi),%rdx
363: 48 8b 47 18 mov 0x18(%rdi),%rax
367: 48 89 fe mov %rdi,%rsi
36a: 48 89 42 08 mov %rax,0x8(%rdx)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36e: 48 89 10 mov %rdx,(%rax)
371: 48 b8 00 01 00 00 00 movabs $0xdead000000000100,%rax
378: 00 ad de
37b: 48 89 47 10 mov %rax,0x10(%rdi)
37f: 48 b8 00 02 00 00 00 movabs $0xdead000000000200,%rax
386: 00 ad de
389: 48 89 47 18 mov %rax,0x18(%rdi)
38d: 48 8b 3d 00 00 00 00 mov 0x0(%rip),%rdi # 394 <collect_mm_slot+0x6c>
394: e8 00 00 00 00 callq 399 <collect_mm_slot+0x71>
399: f0 ff 4b 4c lock decl 0x4c(%rbx)
39d: 74 02 je 3a1 <collect_mm_slot+0x79>
39f: eb 08 jmp 3a9 <collect_mm_slot+0x81>
3a1: 48 89 df mov %rbx,%rdi
3a4: e8 00 00 00 00 callq 3a9 <collect_mm_slot+0x81>
3a9: 5b pop %rbx
3aa: 5d pop %rbp
3ab: c3 retq

which is list_del(&mm_slot->mm_node), I believe (0x328 + 0x42 == 0x36a,
the marked instruction above).
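
the poison constants in the listing back that up: 0xdead000000000100 and
0xdead000000000200 are LIST_POISON1/LIST_POISON2, which list_del() stores
right after unlinking. roughly, from include/linux/list.h (the
!CONFIG_DEBUG_LIST variant):

  static inline void __list_del(struct list_head *prev, struct list_head *next)
  {
          next->prev = prev;              /* the faulting store at 0x36a */
          WRITE_ONCE(prev->next, next);   /* the store at 0x36e */
  }

  static inline void list_del(struct list_head *entry)
  {
          __list_del(entry->prev, entry->next);
          entry->next = LIST_POISON1;     /* movabs $0xdead000000000100 */
          entry->prev = LIST_POISON2;     /* movabs $0xdead000000000200 */
  }

so the fault is on the next->prev store, meaning mm_node.next already held
a bad pointer when collect_mm_slot() ran; if that was LIST_POISON1 left
over from an earlier list_del() of the same slot, it would produce exactly
this kind of general protection fault (just a guess, though).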

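fwiw, the same offset-to-line mapping can be done with gdb on a vmlinux
built with debug info:

  $ gdb vmlinux
  (gdb) list *(collect_mm_slot+0x42)

which should point straight at the list_del() line.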

I attached the patch (just in case).

---
mm/huge_memory.c | 87 +++++++++++++++++++++++++-------------------------------
1 file changed, 39 insertions(+), 48 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 292cedd..1c82fa4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1938,7 +1938,8 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,

static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ /* the only pin is from khugepaged_scan_mm_slot */
+ return atomic_read(&mm->mm_users) <= 1;
}

int __khugepaged_enter(struct mm_struct *mm)
@@ -1950,8 +1951,6 @@ int __khugepaged_enter(struct mm_struct *mm)
if (!mm_slot)
return -ENOMEM;

- /* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -1994,36 +1993,40 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
return 0;
}

-void __khugepaged_exit(struct mm_struct *mm)
+static void collect_mm_slot(struct mm_slot *mm_slot)
{
- struct mm_slot *mm_slot;
- int free = 0;
+ struct mm_struct *mm = mm_slot->mm;

- spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
- free = 1;
- }
- spin_unlock(&khugepaged_mm_lock);

- if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- free_mm_slot(mm_slot);
- mmdrop(mm);
- } else if (mm_slot) {
/*
- * This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_sem.
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
*/
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot) {
+ collect_mm_slot(mm_slot);
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
}
+ spin_unlock(&khugepaged_mm_lock);
}

static void release_pte_page(struct page *page)
@@ -2738,29 +2741,6 @@ out:
return ret;
}

-static void collect_mm_slot(struct mm_slot *mm_slot)
-{
- struct mm_struct *mm = mm_slot->mm;
-
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
-
- if (khugepaged_test_exit(mm)) {
- /* free mm_slot */
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
-
- /*
- * Not strictly needed because the mm exited already.
- *
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- */
-
- /* khugepaged_mm_lock actually not necessary for the below */
- free_mm_slot(mm_slot);
- mmdrop(mm);
- }
-}
-
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
@@ -2782,6 +2762,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
+
+ /*
+ * Do not even try to do anything if the current mm is already
+ * dead. khugepaged_mm_lock will make sure only this or
+ * __khugepaged_exit does the unhashing.
+ */
+ if (!atomic_inc_not_zero(&mm_slot->mm->mm_users)) {
+ collect_mm_slot(mm_slot);
+ return progress;
+ }
spin_unlock(&khugepaged_mm_lock);

mm = mm_slot->mm;
@@ -2865,6 +2855,7 @@ breakouterloop_mmap_sem:

collect_mm_slot(mm_slot);
}
+ mmput_async(mm);

return progress;
}
--
2.9.0.rc1