Re: [PATCH] s390/kfence: fix page fault reporting

From: Baoquan He
Date: Mon Feb 13 2023 - 20:16:01 EST


On 02/13/23 at 07:38pm, Heiko Carstens wrote:
> Baoquan He reported lots of KFENCE reports when /proc/kcore is read,
> e.g. with crash or even simpler with dd:
>
> BUG: KFENCE: invalid read in copy_from_kernel_nofault+0x5e/0x120
> Invalid read at 0x00000000f4f5149f:
> copy_from_kernel_nofault+0x5e/0x120
> read_kcore+0x6b2/0x870
> proc_reg_read+0x9a/0xf0
> vfs_read+0x94/0x270
> ksys_read+0x70/0x100
> __do_syscall+0x1d0/0x200
> system_call+0x82/0xb0
>
> The reason for this is that read_kcore() simply uses
> copy_from_kernel_nofault() to read memory that might have been unmapped by
> KFENCE. Any fault due to pages being unmapped by KFENCE would be handled
> gracefully by the fault handler (exception table fixup).
>
> However the s390 fault handler first reports the fault, and only afterwards
> would perform the exception table fixup. Most architectures have this in
> reversed order, which also avoids the false positive KFENCE reports when an
> unmapped page is accessed.
>
> Therefore change the s390 fault handler so it handles exception table
> fixups before KFENCE page faults are reported.
>
> Reported-by: Baoquan He <bhe@xxxxxxxxxx>
> Signed-off-by: Heiko Carstens <hca@xxxxxxxxxxxxx>

I applied this patch on top of the latest kernel (6.2-rc8) and tested it; it
fixes the problem perfectly. Thanks a lot. Please feel free to add:

Tested-by: Baoquan He <bhe@xxxxxxxxxx>

> ---
> arch/s390/mm/fault.c | 49 +++++++++++++++++++++++++++++++-------------
> 1 file changed, 35 insertions(+), 14 deletions(-)
>
> diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
> index 9649d9382e0a..8e84ed2bb944 100644
> --- a/arch/s390/mm/fault.c
> +++ b/arch/s390/mm/fault.c
> @@ -96,6 +96,20 @@ static enum fault_type get_fault_type(struct pt_regs *regs)
> return KERNEL_FAULT;
> }
>
> +static unsigned long get_fault_address(struct pt_regs *regs)
> +{
> + unsigned long trans_exc_code = regs->int_parm_long;
> +
> + return trans_exc_code & __FAIL_ADDR_MASK;
> +}
> +
> +static bool fault_is_write(struct pt_regs *regs)
> +{
> + unsigned long trans_exc_code = regs->int_parm_long;
> +
> + return (trans_exc_code & store_indication) == 0x400;
> +}
> +
> static int bad_address(void *p)
> {
> unsigned long dummy;
> @@ -228,15 +242,26 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
> (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
> }
>
> -static noinline void do_no_context(struct pt_regs *regs)
> +static noinline void do_no_context(struct pt_regs *regs, vm_fault_t fault)
> {
> + enum fault_type fault_type;
> + unsigned long address;
> + bool is_write;
> +
> if (fixup_exception(regs))
> return;
> + fault_type = get_fault_type(regs);
> + if ((fault_type == KERNEL_FAULT) && (fault == VM_FAULT_BADCONTEXT)) {
> + address = get_fault_address(regs);
> + is_write = fault_is_write(regs);
> + if (kfence_handle_page_fault(address, is_write, regs))
> + return;
> + }
> /*
> * Oops. The kernel tried to access some bad page. We'll have to
> * terminate things with extreme prejudice.
> */
> - if (get_fault_type(regs) == KERNEL_FAULT)
> + if (fault_type == KERNEL_FAULT)
> printk(KERN_ALERT "Unable to handle kernel pointer dereference"
> " in virtual kernel address space\n");
> else
> @@ -255,7 +280,7 @@ static noinline void do_low_address(struct pt_regs *regs)
> die (regs, "Low-address protection");
> }
>
> - do_no_context(regs);
> + do_no_context(regs, VM_FAULT_BADACCESS);
> }
>
> static noinline void do_sigbus(struct pt_regs *regs)
> @@ -286,28 +311,28 @@ static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
> fallthrough;
> case VM_FAULT_BADCONTEXT:
> case VM_FAULT_PFAULT:
> - do_no_context(regs);
> + do_no_context(regs, fault);
> break;
> case VM_FAULT_SIGNAL:
> if (!user_mode(regs))
> - do_no_context(regs);
> + do_no_context(regs, fault);
> break;
> default: /* fault & VM_FAULT_ERROR */
> if (fault & VM_FAULT_OOM) {
> if (!user_mode(regs))
> - do_no_context(regs);
> + do_no_context(regs, fault);
> else
> pagefault_out_of_memory();
> } else if (fault & VM_FAULT_SIGSEGV) {
> /* Kernel mode? Handle exceptions or die */
> if (!user_mode(regs))
> - do_no_context(regs);
> + do_no_context(regs, fault);
> else
> do_sigsegv(regs, SEGV_MAPERR);
> } else if (fault & VM_FAULT_SIGBUS) {
> /* Kernel mode? Handle exceptions or die */
> if (!user_mode(regs))
> - do_no_context(regs);
> + do_no_context(regs, fault);
> else
> do_sigbus(regs);
> } else
> @@ -334,7 +359,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
> struct mm_struct *mm;
> struct vm_area_struct *vma;
> enum fault_type type;
> - unsigned long trans_exc_code;
> unsigned long address;
> unsigned int flags;
> vm_fault_t fault;
> @@ -351,9 +375,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
> return 0;
>
> mm = tsk->mm;
> - trans_exc_code = regs->int_parm_long;
> - address = trans_exc_code & __FAIL_ADDR_MASK;
> - is_write = (trans_exc_code & store_indication) == 0x400;
> + address = get_fault_address(regs);
> + is_write = fault_is_write(regs);
>
> /*
> * Verify that the fault happened in user space, that
> @@ -364,8 +387,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
> type = get_fault_type(regs);
> switch (type) {
> case KERNEL_FAULT:
> - if (kfence_handle_page_fault(address, is_write, regs))
> - return 0;
> goto out;
> case USER_FAULT:
> case GMAP_FAULT:
> --
> 2.37.2
>