Re: scheduling while atomic in z3fold

From: Mike Galbraith
Date: Sun Dec 06 2020 - 04:20:43 EST


On Thu, 2020-12-03 at 14:39 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-12-03 09:18:21 [+0100], Mike Galbraith wrote:
> > On Thu, 2020-12-03 at 03:16 +0100, Mike Galbraith wrote:
> > > On Wed, 2020-12-02 at 23:08 +0100, Sebastian Andrzej Siewior wrote:
> > > Looks like...
> > >
> > > d8f117abb380 z3fold: fix use-after-free when freeing handles
> > >
> > > ...wasn't completely effective...
> >
> > The top two hunks seem to have rendered the thing RT tolerant.
>
> Yes, it appears to. I have no idea if this is a proper fix or not.
> Without your write lock, after a few attempts, KASAN says:
>
> | BUG: KASAN: use-after-free in __pv_queued_spin_lock_slowpath+0x293/0x770
> | Write of size 2 at addr ffff88800e0e10aa by task kworker/u16:3/237

Things that make ya go hmmm...

I started poking at it thinking ok, given write_lock() fixes it, bad
juju must be happening under another read_lock(), so I poisoned the
structure about to be freed by __release_z3fold_page() under lock, and
opened a delay window for bad juju to materialize, but it didn't, so I
just poisoned instead, and sprinkled landmines all over the place.

My landmines are not triggering but the detonator is materializing
inside the structure containing the lock we explode on. Somebody
blasts a recycled z3fold_buddy_slots into ram we were working on?

crash> bt -sx
PID: 11850 TASK: ffff933ee67f2280 CPU: 1 COMMAND: "swapoff"
#0 [ffffa7bf4e7f3900] machine_kexec+0x16e at ffffffff8405f86e
#1 [ffffa7bf4e7f3948] __crash_kexec+0xd2 at ffffffff8414c182
#2 [ffffa7bf4e7f3a08] crash_kexec+0x30 at ffffffff8414d370
#3 [ffffa7bf4e7f3a18] oops_end+0xca at ffffffff8402680a
#4 [ffffa7bf4e7f3a38] no_context+0x14d at ffffffff8406d7ed
#5 [ffffa7bf4e7f3a98] exc_page_fault+0x2b8 at ffffffff848bdb88
#6 [ffffa7bf4e7f3af0] asm_exc_page_fault+0x1e at ffffffff84a00ace
#7 [ffffa7bf4e7f3b78] mark_wakeup_next_waiter+0x51 at ffffffff840ea141
#8 [ffffa7bf4e7f3bd8] rt_mutex_futex_unlock+0x4f at ffffffff848c93ef
#9 [ffffa7bf4e7f3c18] z3fold_zpool_free+0x580 at ffffffffc0f37680 [z3fold]
#10 [ffffa7bf4e7f3c78] zswap_free_entry+0x43 at ffffffff8427c863
#11 [ffffa7bf4e7f3c88] zswap_frontswap_invalidate_page+0x8a at ffffffff8427c96a
#12 [ffffa7bf4e7f3cb0] __frontswap_invalidate_page+0x48 at ffffffff8427c058
#13 [ffffa7bf4e7f3cd8] swapcache_free_entries+0x1ee at ffffffff84276f8e
#14 [ffffa7bf4e7f3d20] free_swap_slot+0x9f at ffffffff8427b93f
#15 [ffffa7bf4e7f3d40] delete_from_swap_cache+0x61 at ffffffff84274651
#16 [ffffa7bf4e7f3d60] try_to_free_swap+0x70 at ffffffff84277550
#17 [ffffa7bf4e7f3d70] unuse_vma+0x55c at ffffffff842786cc
#18 [ffffa7bf4e7f3e90] try_to_unuse+0x139 at ffffffff84278eb9
#19 [ffffa7bf4e7f3ee8] __x64_sys_swapoff+0x1eb at ffffffff842798fb
#20 [ffffa7bf4e7f3f40] do_syscall_64+0x33 at ffffffff848b9ab3
#21 [ffffa7bf4e7f3f50] entry_SYSCALL_64_after_hwframe+0x44 at ffffffff84a0007c
RIP: 00007fdd9ce26d17 RSP: 00007ffe99f98238 RFLAGS: 00000202
RAX: ffffffffffffffda RBX: 00005625219e8b60 RCX: 00007fdd9ce26d17
RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00005625219e8b60
RBP: 0000000000000001 R8: 00007ffe99f982a0 R9: 0000000000000003
R10: 00005625219e8721 R11: 0000000000000202 R12: 0000000000000001
R13: 0000000000000000 R14: 00007ffe99f982a0 R15: 0000000000000000
ORIG_RAX: 00000000000000a8 CS: 0033 SS: 002b
crash> bt -e
PID: 11850 TASK: ffff933ee67f2280 CPU: 1 COMMAND: "swapoff"

KERNEL-MODE EXCEPTION FRAME AT: ffffa7bf4e7f3950
[exception RIP: mark_wakeup_next_waiter+81]
RIP: ffffffff840ea141 RSP: ffffa7bf4e7f3ba0 RFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: 0000000000000001 RSI: ffffa7bf4e7f3bf0 RDI: 0000000000000001
RBP: ffff933e5a7baba8 R8: ffff933f502c2000 R9: ffff933e5a7babc0
R10: 0000000000000001 R11: ffffffff855938b8 R12: ffffa7bf4e7f3be0
R13: ffffa7bf4e7f3bf0 R14: 0000000000000018 R15: ffff933ee67f2280
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
...
crash> gdb list *z3fold_zpool_free+0x580
0xffffffffc0f37680 is in z3fold_zpool_free (mm/z3fold.c:338).
warning: Source file is more recent than executable.
333
334 /* we are freeing a foreign handle if we are here */
335 zhdr->foreign_handles--;
336 is_free = true;
337 if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
338 write_unlock(&slots->lock);
339 return;
340 }
341 for (i = 0; i <= BUDDY_MASK; i++) {
342 if (slots->slot[i]) {
crash> z3fold_buddy_slots -x ffff933e5a7bab80
struct z3fold_buddy_slots {
slot = {0x0, 0x0, 0x0, 0x0},
pool = 0xffff933e4b6d8e00,
lock = {
rtmutex = {
wait_lock = {
raw_lock = {
{
val = {
counter = 0x1
},
{
locked = 0x1,
pending = 0x0
},
{
locked_pending = 0x1,
tail = 0x0
}
}
}
},
waiters = {
rb_root = {
rb_node = 0xffff933e5a7ba700
},
rb_leftmost = 0x0 <== yup, that's why we've been exploding
},
owner = 0xffff933ee67f2280, <== yup, that's our exploding task
save_state = 0x1
},
readers = {
counter = 0x80000000
}
},
poison = 0xfeedbabedeadbeef <== oh dear, that SHOULD be impossible
}
crash>

326 write_lock(&slots->lock);
327 BUG_ON(READ_ONCE(slots->poison)); <== we got past this landmine..
328 *(unsigned long *)handle = 0;
329 if (zhdr->slots == slots) {
330 write_unlock(&slots->lock);
331 return; /* simple case, nothing else to do */
332 }
333
334 /* we are freeing a foreign handle if we are here */
335 zhdr->foreign_handles--;
336 is_free = true;
337 if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
338 write_unlock(&slots->lock); <== ..and exploded here due to a corrupt lock
AND poison written under read_lock()
has suddenly materialized as well!?!
---
mm/z3fold.c | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)

--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -44,6 +44,7 @@
#include <linux/zpool.h>
#include <linux/magic.h>
#include <linux/kmemleak.h>
+#include <linux/delay.h>

/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -92,6 +93,7 @@ struct z3fold_buddy_slots {
unsigned long slot[BUDDY_MASK + 1];
unsigned long pool; /* back link + flags */
rwlock_t lock;
+ unsigned long poison;
};
#define HANDLE_FLAG_MASK (0x03)

@@ -220,6 +222,7 @@ static inline struct z3fold_buddy_slots
kmemleak_not_leak(slots);
slots->pool = (unsigned long)pool;
rwlock_init(&slots->lock);
+ slots->poison = 0;
}

return slots;
@@ -267,11 +270,13 @@ static inline struct z3fold_header *__ge
unsigned long addr;

read_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
addr = *(unsigned long *)handle;
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
if (lock)
locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
if (locked)
break;
cpu_relax();
@@ -319,6 +324,7 @@ static inline void free_handle(unsigned
zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
write_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison)); <== poison was zero here
*(unsigned long *)handle = 0;
if (zhdr->slots == slots) {
write_unlock(&slots->lock);
@@ -338,6 +344,10 @@ static inline void free_handle(unsigned
break;
}
}
+ if (is_free)
+ slots->poison = 0xdeaddeaddeaddead;
+ else
+ BUG_ON(READ_ONCE(slots->poison));
write_unlock(&slots->lock);

if (is_free) {
@@ -475,8 +485,10 @@ static unsigned long __encode_handle(str
h |= (zhdr->last_chunks << BUDDY_SHIFT);

write_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
slots->slot[idx] = h;
write_unlock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
return (unsigned long)&slots->slot[idx];
}

@@ -492,8 +504,10 @@ static unsigned short handle_to_chunks(u
unsigned long addr;

read_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
addr = *(unsigned long *)handle;
read_unlock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}

@@ -509,10 +523,12 @@ static enum buddy handle_to_buddy(unsign
unsigned long addr;

read_lock(&slots->lock);
+ BUG_ON(READ_ONCE(slots->poison));
WARN_ON(handle & (1 << PAGE_HEADLESS));
addr = *(unsigned long *)handle;
read_unlock(&slots->lock);
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ BUG_ON(READ_ONCE(slots->poison));
return (addr - zhdr->first_num) & BUDDY_MASK;
}

@@ -538,6 +554,7 @@ static void __release_z3fold_page(struct

/* If there are no foreign handles, free the handles array */
read_lock(&zhdr->slots->lock);
+ BUG_ON(READ_ONCE(zhdr->slots->poison));
for (i = 0; i <= BUDDY_MASK; i++) {
if (zhdr->slots->slot[i]) {
is_free = false;
@@ -546,6 +563,11 @@ static void __release_z3fold_page(struct
}
if (!is_free)
set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
+ else {
+ zhdr->slots->poison = 0xfeedbabedeadbeef;
+// usleep_range(100, 200);
+// zhdr->slots->poison = 0;
+ }
read_unlock(&zhdr->slots->lock);

if (is_free)
@@ -750,11 +772,14 @@ static struct z3fold_header *compact_sin
new_zhdr->foreign_handles++;
memcpy(q, p, sz);
write_lock(&zhdr->slots->lock);
+ BUG_ON(READ_ONCE(zhdr->slots->poison));
*(unsigned long *)old_handle = (unsigned long)new_zhdr +
__idx(new_zhdr, new_bud);
if (new_bud == LAST)
*(unsigned long *)old_handle |=
(new_zhdr->last_chunks << BUDDY_SHIFT);
+ BUG_ON(READ_ONCE(zhdr->slots->poison));
+ BUG_ON(READ_ONCE(new_zhdr->slots->poison));
write_unlock(&zhdr->slots->lock);
add_to_unbuddied(pool, new_zhdr);
z3fold_page_unlock(new_zhdr);