[RFC v3 06/17] mm: Provide speculative fault infrastructure

From: Laurent Dufour
Date: Thu Apr 27 2017 - 11:53:29 EST


From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

Not holding mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate that the state we started the fault with is still valid. If
it is not, we fail the fault with VM_FAULT_RETRY; otherwise we update
the PTE and we're done.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
[Fix newly introduced pte_spinlock() for speculative page fault]
Signed-off-by: Laurent Dufour <ldufour@xxxxxxxxxxxxxxxxxx>
---
include/linux/mm.h | 3 ++
mm/memory.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 555ac9ac7202..4667be54ba74 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -298,6 +298,7 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */
unsigned long address; /* Faulting virtual address */
+ unsigned int sequence;
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
pte_t orig_pte; /* Value of PTE at the time of fault */
@@ -1237,6 +1238,8 @@ int invalidate_inode_page(struct page *page);
#ifdef CONFIG_MMU
extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+ unsigned long address, unsigned int flags);
extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
diff --git a/mm/memory.c b/mm/memory.c
index 0f7fbee554c4..fd3a0dc122c5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2104,15 +2104,66 @@ static inline void wp_page_reuse(struct vm_fault *vmf)

static bool pte_spinlock(struct vm_fault *vmf)
{
+ bool ret = false;
+
+ /* Check if vma is still valid */
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ return true;
+ }
+
+ local_irq_disable();
+ if (vma_is_dead(vmf->vma, vmf->sequence))
+ goto out;
+
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
- return true;
+
+ if (vma_is_dead(vmf->vma, vmf->sequence)) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
}

static bool pte_map_lock(struct vm_fault *vmf)
{
- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
- return true;
+ bool ret = false;
+
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ return true;
+ }
+
+ /*
+ * The first vma_is_dead() guarantees the page-tables are still valid,
+ * having IRQs disabled ensures they stay around, hence the second
+ * vma_is_dead() to make sure they are still valid once we've got the
+ * lock. After that a concurrent zap_pte_range() will block on the PTL
+ * and thus we're safe.
+ */
+ local_irq_disable();
+ if (vma_is_dead(vmf->vma, vmf->sequence))
+ goto out;
+
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+
+ if (vma_is_dead(vmf->vma, vmf->sequence)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
+ }
+
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
}

/*
@@ -2544,6 +2595,7 @@ int do_swap_page(struct vm_fault *vmf)
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
+ /* XXX vmf->pmd might be dead */
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
} else if (is_hwpoison_entry(entry)) {
@@ -3659,6 +3711,97 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
return handle_pte_fault(&vmf);
}

+int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+ unsigned int flags)
+{
+ struct vm_fault vmf = {
+ .address = address,
+ .flags = flags | FAULT_FLAG_SPECULATIVE,
+ };
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ int dead, seq, idx, ret = VM_FAULT_RETRY;
+ struct vm_area_struct *vma;
+
+ idx = srcu_read_lock(&vma_srcu);
+ vma = find_vma_srcu(mm, address);
+ if (!vma)
+ goto unlock;
+
+ /*
+ * Validate the VMA found by the lockless lookup.
+ */
+ dead = RB_EMPTY_NODE(&vma->vm_rb);
+ seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock,vma_rb_erase() */
+ if ((seq & 1) || dead) /* XXX wait for !&1 instead? */
+ goto unlock;
+
+ if (address < vma->vm_start || vma->vm_end <= address)
+ goto unlock;
+
+ /*
+ * We need to re-validate the VMA after checking the bounds, otherwise
+ * we might have a false positive on the bounds.
+ */
+ if (read_seqcount_retry(&vma->vm_sequence, seq))
+ goto unlock;
+
+ /*
+ * Do a speculative lookup of the PTE entry.
+ */
+ local_irq_disable();
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto out_walk;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ goto out_walk;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto out_walk;
+
+ /*
+ * The above does not allocate/instantiate page-tables because doing so
+ * would lead to the possibility of instantiating page-tables after
+ * free_pgtables() -- and consequently leaking them.
+ *
+ * The result is that we take at least one !speculative fault per PMD
+ * in order to instantiate it.
+ *
+ * XXX try and fix that.. should be possible somehow.
+ */
+
+ if (pmd_huge(*pmd)) /* XXX no huge support */
+ goto out_walk;
+
+ vmf.vma = vma;
+ vmf.pmd = pmd;
+ vmf.pgoff = linear_page_index(vma, address);
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
+ vmf.sequence = seq;
+
+#if 0
+#warning This is done in handle_pte_fault()...
+ pte = pte_offset_map(pmd, address);
+ fe.entry = ACCESS_ONCE(pte); /* XXX gup_get_pte() */
+ pte_unmap(pte);
+#endif
+ local_irq_enable();
+
+ ret = handle_pte_fault(&vmf);
+
+unlock:
+ srcu_read_unlock(&vma_srcu, idx);
+ return ret;
+
+out_walk:
+ local_irq_enable();
+ goto unlock;
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
--
2.7.4