 mm/memory.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 122 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index f3eb55975902..49ceddd91db4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -773,7 +773,115 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static inline void
+/*
+ * Copy a single small page for fork().
+ *
+ * We have already marked it read-only in the parent if
+ * it's a COW page, and the pte passed in has also been
+ * marked read-only. So the normal thing to do is to
+ * simply increase the page count and the page mapping
+ * count, and the rss, and use the pte as-is. Done.
+ *
+ * However, there is one situation where we can't just
+ * rely on the COW behavior - if the page has been pinned
+ * for DMA in the parent, we can't just give a reference
+ * to it to the child, and say "whoever writes to it will
+ * force a COW". No, the pinned page needs to remain
+ * with the parent, and we need to give the child a copy.
+ *
+ * NOTE! This should never happen. Good pinning users
+ * will either not fork, or will mark the area they pinned
+ * as MADV_DONTFORK so that this situation never comes up.
+ * But if you don't do that...
+ *
+ * Note that if a small page has been pinned, we know the
+ * mapcount for that page should be 1, since the pinning
+ * will have done the COW at that point. So together with
+ * the elevated refcount, we have very solid heuristics
+ * for "is this page something we need to worry about"
+ */
+static int copy_normal_page(struct vm_area_struct *vma, unsigned long addr,
+	struct mm_struct *src_mm, struct mm_struct *dst_mm,
+	pte_t *src_pte, pte_t *dst_pte,
+	struct page *src_page, int *rss)
+{
+	struct page *dst_page;
+
+	if (likely(!page_maybe_dma_pinned(src_page)))
+		goto reuse_page;
+
+	if (!is_cow_mapping(vma->vm_flags))
+		goto reuse_page;
+
+	if (__page_mapcount(src_page) != 1)
+		goto reuse_page;
+
+	if (!vma->anon_vma || !pte_dirty(*src_pte))
+		goto reuse_page;
+
+	/*
+	 * We have now checked that the page count implies that
+	 * it's pinned, and that it's mapped only in this process,
+	 * and that it's dirty and we have an anon_vma (so it's
+	 * an actual write pin, not some read-only one).
+	 *
+	 * That means we have to treat it specially. Nasty.
+	 */
+
+	/*
+	 * Note the wrong 'vma' - source rather than destination.
+	 * It's only used for policy, which is the same.
+	 *
+	 * The bigger issue is that we're holding the ptl lock,
+	 * so this needs to be a non-sleeping allocation.
+	 */
+	dst_page = alloc_page_vma(GFP_ATOMIC | __GFP_HIGH | __GFP_NOWARN, vma, addr);
+	if (!dst_page)
+		return -ENOMEM;
+
+	if (mem_cgroup_charge(dst_page, dst_mm, GFP_ATOMIC)) {
+		put_page(dst_page);
+		return -ENOMEM;
+	}
+	cgroup_throttle_swaprate(dst_page, GFP_ATOMIC);
+	__SetPageUptodate(dst_page);
+
+	copy_user_highpage(dst_page, src_page, addr, vma);
+	*dst_pte = mk_pte(dst_page, vma->vm_page_prot);
+
+	/*
+	 * NOTE! This uses the wrong vma again, but the only thing
+	 * that matters are the vma flags and anon_vma, which are
+	 * the same for source and destination.
+	 */
+	page_add_new_anon_rmap(dst_page, vma, addr, false);
+	lru_cache_add_inactive_or_unevictable(dst_page, vma);
+	rss[mm_counter(dst_page)]++;
+
+	/*
+	 * Final note: make the source writable again. The fact that
+	 * it was unwritable means that we didn't race with any new
+	 * PIN events using fast-GUP, and we've held on to the page
+	 * table lock the whole time so it's safe to just make it
+	 * writable again here.
+	 *
+	 * We might race with hardware walkers, but the dirty bit
+	 * was already set, so no fear of losing a race with a hw
+	 * walker that sets that.
+	 */
+	if (vma->vm_flags & VM_WRITE)
+		*src_pte = pte_mkwrite(*src_pte);
+
+	return 0;
+
+reuse_page:
+	get_page(src_page);
+	page_dup_rmap(src_page, false);
+	rss[mm_counter(src_page)]++;
+	return 0;
+}
+
+static inline int
 copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr, int *rss)
@@ -809,12 +917,15 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	page = vm_normal_page(vma, addr, pte);
 	if (page) {
-		get_page(page);
-		page_dup_rmap(page, false);
-		rss[mm_counter(page)]++;
+		int error;
+
+		error = copy_normal_page(vma, addr, src_mm, dst_mm, src_pte, &pte, page, rss);
+		if (error)
+			return error;
 	}
 
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -824,7 +935,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
+	int progress = 0, error = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
 
@@ -865,8 +976,10 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			progress += 8;
 			continue;
 		}
-		copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+		error = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
 							vma, addr, rss);
+		if (error)
+			break;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -877,6 +990,9 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 
+	if (error)
+		return error;
+
 	if (entry.val) {
 		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
 			return -ENOMEM;
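
As a side note on the MADV_DONTFORK escape hatch mentioned in the big comment above: the intended userspace fix is for the pinning application to mark its DMA buffer so that fork() never duplicates that mapping at all, and copy_normal_page() never even sees a pinned page. A minimal sketch of that pattern follows; the anonymous mapping and the 1 MiB size are illustrative assumptions, not something taken from this patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;	/* illustrative 1 MiB buffer size */
	void *buf;

	/* Anonymous mapping that would later be pinned for DMA (e.g. by an RDMA registration). */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Tell the kernel not to copy this range into children at all,
	 * so a later fork() never has to reason about pinned pages here.
	 */
	if (madvise(buf, len, MADV_DONTFORK)) {
		perror("madvise(MADV_DONTFORK)");
		return 1;
	}

	/* ... pin buf for DMA, then fork() helpers without hitting the copy path ... */
	return 0;
}

After fork() the child simply has no mapping at that range, which is why MADV_DONTFORK is only usable for buffers the child never needs to touch.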