Re: [PATCH 2/3] mm/page_table_check: check entries at pud and pmd levels

From: Wei Xu
Date: Thu Jan 20 2022 - 12:59:30 EST


On Wed, Jan 19, 2022 at 8:25 PM Pasha Tatashin
<pasha.tatashin@xxxxxxxxxx> wrote:
>
> syzbot detected a case where the page table counters were not properly
> updated.
>
> syzkaller login: ------------[ cut here ]------------
> kernel BUG at mm/page_table_check.c:162!
> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> CPU: 0 PID: 3099 Comm: pasha Not tainted 5.16.0+ #48
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO4
> RIP: 0010:__page_table_check_zero+0x159/0x1a0
> Code: 7d 3a b2 ff 45 39 f5 74 2a e8 43 38 b2 ff 4d 85 e4 01
> RSP: 0018:ffff888010667418 EFLAGS: 00010293
> RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000
> RDX: ffff88800cea8680 RSI: ffffffff81becaf9 RDI: 0000000003
> RBP: ffff888010667450 R08: 0000000000000001 R09: 0000000000
> R10: ffffffff81becaab R11: 0000000000000001 R12: ffff888008
> R13: 0000000000000001 R14: 0000000000000200 R15: dffffc0000
> FS: 0000000000000000(0000) GS:ffff888035e00000(0000) knlG0
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007ffd875cad00 CR3: 00000000094ce000 CR4: 0000000000
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000
> Call Trace:
> <TASK>
> free_pcp_prepare+0x3be/0xaa0
> free_unref_page+0x1c/0x650
> ? trace_hardirqs_on+0x6a/0x1d0
> free_compound_page+0xec/0x130
> free_transhuge_page+0x1be/0x260
> __put_compound_page+0x90/0xd0
> release_pages+0x54c/0x1060
> ? filemap_remove_folio+0x161/0x210
> ? lock_downgrade+0x720/0x720
> ? __put_page+0x150/0x150
> ? filemap_free_folio+0x164/0x350
> __pagevec_release+0x7c/0x110
> shmem_undo_range+0x85e/0x1250
> ...
>
> The repro involved having a huge page that is split due to uprobe event
> temporarily replacing one of the pages in the huge page. Later the huge
> page was combined again, but the counters were off, as the PTE level
> was not properly updated.
>
> Make sure that not only huge page but also small pages are updated when
> a new entry is set or cleared.
>
> Fixes: df4e817b7108 ("mm: page table check")
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@xxxxxxxxxx>
> ---
> mm/page_table_check.c | 52 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 52 insertions(+)
>
> diff --git a/mm/page_table_check.c b/mm/page_table_check.c
> index 7504e7caa2a1..2341ac382cd5 100644
> --- a/mm/page_table_check.c
> +++ b/mm/page_table_check.c
> @@ -145,6 +145,40 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
> }
> }
>
> +static void pte_clear_level(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep)
> +{
> + unsigned long i;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + __page_table_check_pte_clear(mm, addr, *ptep);
> + addr += PAGE_SIZE;
> + ptep++;
> + }
> +}
> +
> +static void pmd_clear_level(struct mm_struct *mm, unsigned long addr,
> + pmd_t *pmdp)
> +{
> + unsigned long i;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pmd_t old_pmd = *pmdp;
> +
> + if (pmd_user_accessible_page(old_pmd)) {
> + page_table_check_clear(mm, addr, pmd_pfn(old_pmd),
> + PMD_PAGE_SIZE >> PAGE_SHIFT);
> + } else if (!pmd_bad(old_pmd) && !pmd_leaf(old_pmd)) {
> + pte_t *ptep = pte_offset_map(&old_pmd, addr);
> +
> + pte_clear_level(mm, addr, ptep);
> + pte_unmap(ptep);
> + }

You can call __page_table_check_pmd_clear(mm, addr, old_pmd)
instead to share the new code.

>
> + addr += PMD_PAGE_SIZE;
> + pmdp++;
> + }
> +}
> +
> /*
> * page is on free list, or is being allocated, verify that counters are zeroes
> * crash if they are not.
> @@ -186,6 +220,11 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
> if (pmd_user_accessible_page(pmd)) {
> page_table_check_clear(mm, addr, pmd_pfn(pmd),
> PMD_PAGE_SIZE >> PAGE_SHIFT);
> + } else if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
> + pte_t *ptep = pte_offset_map(&pmd, addr);
> +
> + pte_clear_level(mm, addr, ptep);
> + pte_unmap(ptep);
> }
> }
> EXPORT_SYMBOL(__page_table_check_pmd_clear);
> @@ -199,6 +238,10 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
> if (pud_user_accessible_page(pud)) {
> page_table_check_clear(mm, addr, pud_pfn(pud),
> PUD_PAGE_SIZE >> PAGE_SHIFT);
> + } else if (!pud_bad(pud) && !pud_leaf(pud)) {
> + pmd_t *pmdp = pmd_offset(&pud, addr);
> +
> + pmd_clear_level(mm, addr, pmdp);
> }
> }
> EXPORT_SYMBOL(__page_table_check_pud_clear);
> @@ -237,6 +280,11 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
> if (pmd_user_accessible_page(old_pmd)) {
> page_table_check_clear(mm, addr, pmd_pfn(old_pmd),
> PMD_PAGE_SIZE >> PAGE_SHIFT);
> + } else if (!pmd_bad(old_pmd) && !pmd_leaf(old_pmd)) {
> + pte_t *ptep = pte_offset_map(&old_pmd, addr);
> +
> + pte_clear_level(mm, addr, ptep);
> + pte_unmap(ptep);
> }
>

How about replacing the above code with
__page_table_check_pmd_clear(mm, addr, old_pmd)?

> if (pmd_user_accessible_page(pmd)) {
> @@ -259,6 +307,10 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
> if (pud_user_accessible_page(old_pud)) {
> page_table_check_clear(mm, addr, pud_pfn(old_pud),
> PUD_PAGE_SIZE >> PAGE_SHIFT);
> + } else if (!pud_bad(old_pud) && !pud_leaf(old_pud)) {
> + pmd_t *pmdp = pmd_offset(&old_pud, addr);
> +
> + pmd_clear_level(mm, addr, pmdp);
> }

How about replacing the above code with
__page_table_check_pud_clear(mm, addr, old_pud)?
>
>
>
> if (pud_user_accessible_page(pud)) {
> --
> 2.34.1.703.g22d0c6ccf7-goog
>