Re: Non-GPL export of invalidate_mmap_range

From: Paul E. McKenney
Date: Fri Feb 20 2004 - 14:47:06 EST


On Fri, Feb 20, 2004 at 12:07:25AM -0500, Daniel Phillips wrote:
> On Thursday 19 February 2004 14:47, Paul E. McKenney wrote:
> > OK, I surrender. I got some private email agreeing with this
> > viewpoint. Any dissenters, speak soon, or...
>
> An implementation is going to look something like the patch below.
> Unfortunately I don't think there is a way around passing an extra parameter
> all the way down the unmap call chain. Doubly unfortunately, this doesn't
> give any benefit at all to anybody who doesn't use a clustered filesystem
> (which is nearly everybody) while there is a marginal cost. Do you know a
> better way? Anyway, this is the price of correct MAP_PRIVATE semantics for
> clustered filesystems. At least I have quantified it so we can decide if it's
> worth it. (My opinion: correctness is always worth it.)

"My work is done!" ;-)

Almost, anyway. A few comments interspersed. This would be in
addition to invalidate_mmap_range-non-gpl-export.patch, right?

I cannot think of any reasonable alternative to passing the parameter
down either, as it certainly does not be reasonable to duplicate the
code...

Thanx, Paul

> Regards,
>
> Daniel
>
> --- 2.6.3.clean/include/linux/mm.h 2004-02-17 22:57:13.000000000 -0500
> +++ 2.6.3/include/linux/mm.h 2004-02-19 23:18:08.000000000 -0500
> @@ -434,9 +434,7 @@
> unsigned long size);
> int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
> struct vm_area_struct *start_vma, unsigned long start_addr,
> - unsigned long end_addr, unsigned long *nr_accounted);
> -void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> - unsigned long address, unsigned long size);
> + unsigned long end_addr, unsigned long *nr_accounted, int zap);

How about something like "private_too" instead of "zap"?

(Ah! unmap_page_range() converted to static, since it is used only
in memory.c.)

> void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
> int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
> struct vm_area_struct *vma);
> @@ -444,8 +442,7 @@
> unsigned long size, pgprot_t prot);
>
> extern void invalidate_mmap_range(struct address_space *mapping,
> - loff_t const holebegin,
> - loff_t const holelen);
> + loff_t const holebegin, loff_t const holelen, int zap);
> extern int vmtruncate(struct inode * inode, loff_t offset);
> extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
> extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
> --- 2.6.3.clean/mm/memory.c 2004-02-17 22:57:47.000000000 -0500
> +++ 2.6.3/mm/memory.c 2004-02-19 23:48:23.000000000 -0500
> @@ -386,7 +386,7 @@
>
> static void
> zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd,
> - unsigned long address, unsigned long size)
> + unsigned long address, unsigned long size, int zap)
> {
> unsigned long offset;
> pte_t *ptep;
> @@ -414,7 +414,7 @@
> tlb_remove_tlb_entry(tlb, ptep, address+offset);
> if (pfn_valid(pfn)) {
> struct page *page = pfn_to_page(pfn);
> - if (!PageReserved(page)) {
> + if (!PageReserved(page) && (zap || (page->mapping && !PageSwapCache(page)))) {

Longish line...

> if (pte_dirty(pte))
> set_page_dirty(page);
> if (page->mapping && pte_young(pte) &&
> @@ -436,7 +436,7 @@
>
> static void
> zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir,
> - unsigned long address, unsigned long size)
> + unsigned long address, unsigned long size, int zap)
> {
> pmd_t * pmd;
> unsigned long end;
> @@ -453,14 +453,14 @@
> if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
> end = ((address + PGDIR_SIZE) & PGDIR_MASK);
> do {
> - zap_pte_range(tlb, pmd, address, end - address);
> - address = (address + PMD_SIZE) & PMD_MASK;
> + zap_pte_range(tlb, pmd, address, end - address, zap);
> + address = (address + PMD_SIZE) & PMD_MASK;
> pmd++;
> } while (address < end);
> }
>
> -void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> - unsigned long address, unsigned long end)
> +static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> + unsigned long address, unsigned long end, int zap)
> {
> pgd_t * dir;
>
> @@ -474,7 +474,7 @@
> dir = pgd_offset(vma->vm_mm, address);
> tlb_start_vma(tlb, vma);
> do {
> - zap_pmd_range(tlb, dir, address, end - address);
> + zap_pmd_range(tlb, dir, address, end - address, zap);
> address = (address + PGDIR_SIZE) & PGDIR_MASK;
> dir++;
> } while (address && (address < end));
> @@ -524,7 +524,7 @@
> */
> int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
> struct vm_area_struct *vma, unsigned long start_addr,
> - unsigned long end_addr, unsigned long *nr_accounted)
> + unsigned long end_addr, unsigned long *nr_accounted, int zap)
> {
> unsigned long zap_bytes = ZAP_BLOCK_SIZE;
> unsigned long tlb_start = 0; /* For tlb_finish_mmu */
> @@ -568,7 +568,7 @@
> tlb_start_valid = 1;
> }
>
> - unmap_page_range(*tlbp, vma, start, start + block);
> + unmap_page_range(*tlbp, vma, start, start + block, zap);
> start += block;
> zap_bytes -= block;
> if ((long)zap_bytes > 0)
> @@ -594,8 +594,8 @@
> * @address: starting address of pages to zap
> * @size: number of bytes to zap
> */
> -void zap_page_range(struct vm_area_struct *vma,
> - unsigned long address, unsigned long size)
> +void invalidate_page_range(struct vm_area_struct *vma,

Would it be useful for this to be inline? (Wouldn't seem so,
zapping mappings has enough overhead that an extra level of
function call should be deep down in the noise...)

> + unsigned long address, unsigned long size, int zap)
> {
> struct mm_struct *mm = vma->vm_mm;
> struct mmu_gather *tlb;
> @@ -612,11 +612,17 @@
> lru_add_drain();
> spin_lock(&mm->page_table_lock);
> tlb = tlb_gather_mmu(mm, 0);
> - unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted);
> + unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, zap);
> tlb_finish_mmu(tlb, address, end);
> spin_unlock(&mm->page_table_lock);
> }
>
> +void zap_page_range(struct vm_area_struct *vma,
> + unsigned long address, unsigned long size)
> +{
> + invalidate_page_range(vma, address, size, 1);
> +}
> +
> /*
> * Do a quick page-table lookup for a single page.
> * mm->page_table_lock must be held.
> @@ -1095,9 +1101,9 @@
> continue; /* Mapping disjoint from hole. */
> zba = (hba <= vba) ? vba : hba;
> zea = (vea <= hea) ? vea : hea;
> - zap_page_range(vp,
> + invalidate_page_range(vp,
> ((zba - vba) << PAGE_SHIFT) + vp->vm_start,
> - (zea - zba + 1) << PAGE_SHIFT);
> + (zea - zba + 1) << PAGE_SHIFT, 1);
> }
> }
>
> @@ -1116,7 +1122,7 @@
> * end of the file.
> */
> void invalidate_mmap_range(struct address_space *mapping,
> - loff_t const holebegin, loff_t const holelen)
> + loff_t const holebegin, loff_t const holelen, int zap)
> {
> unsigned long hba = holebegin >> PAGE_SHIFT;
> unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

Doesn't the new argument need to be passed down through
invalidate_mmap_range_list()?

> @@ -1156,7 +1162,7 @@
> if (inode->i_size < offset)
> goto do_expand;
> i_size_write(inode, offset);
> - invalidate_mmap_range(mapping, offset + PAGE_SIZE - 1, 0);
> + invalidate_mmap_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
> truncate_inode_pages(mapping, offset);
> goto out_truncate;
>
> --- 2.6.3.clean/mm/mmap.c 2004-02-17 22:58:32.000000000 -0500
> +++ 2.6.3/mm/mmap.c 2004-02-19 22:46:01.000000000 -0500
> @@ -1134,7 +1134,7 @@
>
> lru_add_drain();
> tlb = tlb_gather_mmu(mm, 0);
> - unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
> + unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, 1);
> vm_unacct_memory(nr_accounted);
>
> if (is_hugepage_only_range(start, end - start))
> @@ -1436,7 +1436,7 @@
> flush_cache_mm(mm);
> /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
> mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
> - ~0UL, &nr_accounted);
> + ~0UL, &nr_accounted, 1);
> vm_unacct_memory(nr_accounted);
> BUG_ON(mm->map_count); /* This is just debugging */
> clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
>
>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/