 mm/memory.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 122 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index f3eb55975902..49ceddd91db4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -773,7 +773,115 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static inline void
+/*
+ * Copy a single small page for fork().
+ *
+ * We have already marked it read-only in the parent if
+ * it's a COW page, and the pte passed in has also been
+ * marked read-only. So the normal thing to do is to
+ * simply increase the page count and the page mapping
+ * count, and the rss, and use the pte as-is. Done.
+ *
+ * However, there is one situation where we can't just
+ * rely on the COW behavior - if the page has been pinned
+ * for DMA in the parent, we can't just give a reference
+ * to it to the child, and say "whoever writes to it will
+ * force a COW". No, the pinned page needs to remain
+ * with the parent, and we need to give the child a copy.
+ *
+ * NOTE! This should never happen. Good pinning users
+ * will either not fork, or will mark the area they pinned
+ * as MADV_DONTFORK so that this situation never comes up.
+ * But if you don't do that...
+ *
+ * Note that if a small page has been pinned, we know the
+ * mapcount for that page should be 1, since the pinning
+ * will have done the COW at that point. So together with
+ * the elevated refcount, we have very solid heuristics
+ * for "is this page something we need to worry about"
+ */
+static int copy_normal_page(struct vm_area_struct *vma, unsigned long addr,
+	struct mm_struct *src_mm, struct mm_struct *dst_mm,
+	pte_t *src_pte, pte_t *dst_pte,
+	struct page *src_page, int *rss)
+{
+	struct page *dst_page;
+
+	if (likely(!page_maybe_dma_pinned(src_page)))
+		goto reuse_page;
+
+	if (!is_cow_mapping(vma->vm_flags))
+		goto reuse_page;
+
+	if (__page_mapcount(src_page) != 1)
+		goto reuse_page;
+
+	if (!vma->anon_vma || !pte_dirty(*src_pte))
+		goto reuse_page;
+
+	/*
+	 * We have now checked that the page count implies that
+	 * it's pinned, and that it's mapped only in this process,
+	 * and that it's dirty and we have an anon_vma (so it's
+	 * an actual write pin, not some read-only one).
+	 *
+	 * That means we have to treat it specially. Nasty.
+	 */
+
+	/*
+	 * Note the wrong 'vma' - source rather than destination.
+	 * It's only used for policy, which is the same.
+	 *
+	 * The bigger issue is that we're holding the ptl lock,
+	 * so this needs to be a non-sleeping allocation.
+	 */
+	dst_page = alloc_page_vma(GFP_ATOMIC | __GFP_HIGH | __GFP_NOWARN, vma, addr);
+	if (!dst_page)
+		return -ENOMEM;
+
+	if (mem_cgroup_charge(dst_page, dst_mm, GFP_ATOMIC)) {
+		put_page(dst_page);
+		return -ENOMEM;
+	}
+	cgroup_throttle_swaprate(dst_page, GFP_ATOMIC);
+	__SetPageUptodate(dst_page);
+
+	copy_user_highpage(dst_page, src_page, addr, vma);
+	*dst_pte = mk_pte(dst_page, vma->vm_page_prot);
+
+	/*
+	 * NOTE! This uses the wrong vma again, but the only thing
+	 * that matters are the vma flags and anon_vma, which are
+	 * the same for source and destination.
+	 */
+	page_add_new_anon_rmap(dst_page, vma, addr, false);
+	lru_cache_add_inactive_or_unevictable(dst_page, vma);
+	rss[mm_counter(dst_page)]++;
+
+	/*
+	 * Final note: make the source writable again. The fact that
+	 * it was unwritable means that we didn't race with any new
+	 * PIN events using fast-GUP, and we've held on to the page
+	 * table lock the whole time so it's safe to just make it
+	 * writable again here.
+	 *
+	 * We might race with hardware walkers, but the dirty bit
+	 * was already set, so no fear of losing a race with a hw
+	 * walker that sets that.
+	 */
+	if (vma->vm_flags & VM_WRITE)
+		*src_pte = pte_mkwrite(*src_pte);
+
+	return 0;
+
+reuse_page:
+	get_page(src_page);
+	page_dup_rmap(src_page, false);
+	rss[mm_counter(src_page)]++;
+	return 0;
+}
+
+static inline int
 copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr, int *rss)
@@ -809,12 +917,15 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	page = vm_normal_page(vma, addr, pte);
 	if (page) {
-		get_page(page);
-		page_dup_rmap(page, false);
-		rss[mm_counter(page)]++;
+		int error;
+
+		error = copy_normal_page(vma, addr, src_mm, dst_mm, src_pte, &pte, page, rss);
+		if (error)
+			return error;
 	}
 
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -824,7 +935,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
+	int progress = 0, error = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
 
@@ -865,8 +976,10 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			progress += 8;
 			continue;
 		}
-		copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+		error = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
 							vma, addr, rss);
+		if (error)
+			break;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -877,6 +990,9 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 
+	if (error)
+		return error;
+
 	if (entry.val) {
 		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
 			return -ENOMEM;
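
As a side note on the MADV_DONTFORK escape hatch mentioned in the big comment above: the intended userspace fix is for the pinning application to mark its DMA buffer so that fork() never duplicates that mapping at all, and copy_normal_page() never even sees a pinned page. A minimal sketch of that pattern follows; the anonymous mapping and the 1 MiB size are illustrative assumptions, not something taken from this patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;	/* illustrative 1 MiB buffer size */
	void *buf;

	/* Anonymous mapping that would later be pinned for DMA (e.g. by an RDMA registration). */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Tell the kernel not to copy this range into children at all,
	 * so a later fork() never has to reason about pinned pages here.
	 */
	if (madvise(buf, len, MADV_DONTFORK)) {
		perror("madvise(MADV_DONTFORK)");
		return 1;
	}

	/* ... pin buf for DMA, then fork() helpers without hitting the copy path ... */
	return 0;
}

After fork() the child simply has no mapping at that range, which is why MADV_DONTFORK is only usable for buffers the child never needs to touch.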