Re: [PATCH] drm/amdgpu: Count disabled CRTCs in commit tail earlier

From: Andrey Grodzovsky
Date: Fri Jun 22 2018 - 22:42:27 EST




On 06/22/2018 09:03 PM, Andrey Grodzovsky wrote:


On 06/22/2018 02:56 PM, Lyude Paul wrote:
On Fri, 2018-06-22 at 13:34 -0400, Andrey Grodzovsky wrote:
On 06/21/2018 04:48 PM, Lyude Paul wrote:
This fixes a regression I accidentally reduced that was picked up by
kasan, where we were checking the CRTC atomic states after DRM's helpers
had already freed them. Example:

==================================================================
BUG: KASAN: use-after-free in
amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
Read of size 1 at addr ffff8803a697b071 by task kworker/u16:0/7

CPU: 7 PID: 7 Comm: kworker/u16:0 Tainted: G OÂÂÂÂÂ 4.18.0-
rc1Lyude-Upstream+ #1
Hardware name: HP HP ZBook 15 G4/8275, BIOS P70 Ver. 01.21 05/02/2018
Workqueue: events_unbound commit_work [drm_kms_helper]
Call Trace:
ÂÂ dump_stack+0xc1/0x169
ÂÂ ? dump_stack_print_info.cold.1+0x42/0x42
ÂÂ ? kmsg_dump_rewind_nolock+0xd9/0xd9
ÂÂ ? printk+0x9f/0xc5
ÂÂ ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
ÂÂ print_address_description+0x6c/0x23c
ÂÂ ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
ÂÂ kasan_report.cold.6+0x241/0x2fd
ÂÂ amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
ÂÂ ? commit_planes_to_stream.constprop.45+0x13b0/0x13b0 [amdgpu]
ÂÂ ? cpu_load_update_active+0x290/0x290
ÂÂ ? finish_task_switch+0x2bd/0x840
ÂÂ ? __switch_to_asm+0x34/0x70
ÂÂ ? read_word_at_a_time+0xe/0x20
ÂÂ ? strscpy+0x14b/0x460
ÂÂ ? drm_atomic_helper_wait_for_dependencies+0x47d/0x7e0 [drm_kms_helper]
ÂÂ commit_tail+0x96/0xe0 [drm_kms_helper]
ÂÂ process_one_work+0x88a/0x1360
ÂÂ ? create_worker+0x540/0x540
ÂÂ ? __sched_text_start+0x8/0x8
ÂÂ ? move_queued_task+0x760/0x760
ÂÂ ? call_rcu_sched+0x20/0x20
ÂÂ ? vsnprintf+0xcda/0x1350
ÂÂ ? wait_woken+0x1c0/0x1c0
ÂÂ ? mutex_unlock+0x1d/0x40
ÂÂ ? init_timer_key+0x190/0x230
ÂÂ ? schedule+0xea/0x390
ÂÂ ? __schedule+0x1ea0/0x1ea0
ÂÂ ? need_to_create_worker+0xe4/0x210
ÂÂ ? init_worker_pool+0x700/0x700
ÂÂ ? try_to_del_timer_sync+0xbf/0x110
ÂÂ ? del_timer+0x120/0x120
ÂÂ ? __mutex_lock_slowpath+0x10/0x10
ÂÂ worker_thread+0x196/0x11f0
ÂÂ ? flush_rcu_work+0x50/0x50
ÂÂ ? __switch_to_asm+0x34/0x70
ÂÂ ? __switch_to_asm+0x34/0x70
ÂÂ ? __switch_to_asm+0x40/0x70
ÂÂ ? __switch_to_asm+0x34/0x70
ÂÂ ? __switch_to_asm+0x40/0x70
ÂÂ ? __switch_to_asm+0x34/0x70
ÂÂ ? __switch_to_asm+0x40/0x70
ÂÂ ? __schedule+0x7d6/0x1ea0
ÂÂ ? migrate_swap_stop+0x850/0x880
ÂÂ ? __sched_text_start+0x8/0x8
ÂÂ ? save_stack+0x8c/0xb0
ÂÂ ? kasan_kmalloc+0xbf/0xe0
ÂÂ ? kmem_cache_alloc_trace+0xe4/0x190
ÂÂ ? kthread+0x98/0x390
ÂÂ ? ret_from_fork+0x35/0x40
ÂÂ ? ret_from_fork+0x35/0x40
ÂÂ ? deactivate_slab.isra.67+0x3c4/0x5c0
ÂÂ ? kthread+0x98/0x390
ÂÂ ? kthread+0x98/0x390
ÂÂ ? set_track+0x76/0x120
ÂÂ ? schedule+0xea/0x390
ÂÂ ? __schedule+0x1ea0/0x1ea0
ÂÂ ? wait_woken+0x1c0/0x1c0
ÂÂ ? kasan_unpoison_shadow+0x30/0x40
ÂÂ ? parse_args.cold.15+0x17a/0x17a
ÂÂ ? flush_rcu_work+0x50/0x50
ÂÂ kthread+0x2d4/0x390
ÂÂ ? kthread_create_worker_on_cpu+0xc0/0xc0
ÂÂ ret_from_fork+0x35/0x40

Allocated by task 1124:
ÂÂ kasan_kmalloc+0xbf/0xe0
ÂÂ kmem_cache_alloc_trace+0xe4/0x190
ÂÂ dm_crtc_duplicate_state+0x78/0x130 [amdgpu]
ÂÂ drm_atomic_get_crtc_state+0x147/0x410 [drm]
ÂÂ page_flip_common+0x57/0x230 [drm_kms_helper]
ÂÂ drm_atomic_helper_page_flip+0xa6/0x110 [drm_kms_helper]
ÂÂ drm_mode_page_flip_ioctl+0xc4b/0x10a0 [drm]
ÂÂ drm_ioctl_kernel+0x1d4/0x260 [drm]
ÂÂ drm_ioctl+0x433/0x920 [drm]
ÂÂ amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
ÂÂ do_vfs_ioctl+0x1a1/0x13d0
ÂÂ ksys_ioctl+0x60/0x90
ÂÂ __x64_sys_ioctl+0x6f/0xb0
ÂÂ do_syscall_64+0x147/0x440
ÂÂ entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 1124:
ÂÂ __kasan_slab_free+0x12e/0x180
ÂÂ kfree+0x92/0x1a0
ÂÂ drm_atomic_state_default_clear+0x315/0xc40 [drm]
ÂÂ __drm_atomic_state_free+0x35/0xd0 [drm]
ÂÂ drm_atomic_helper_update_plane+0xac/0x350 [drm_kms_helper]
ÂÂ __setplane_internal+0x2d6/0x840 [drm]
ÂÂ drm_mode_cursor_universal+0x41e/0xbe0 [drm]
ÂÂ drm_mode_cursor_common+0x49f/0x880 [drm]
ÂÂ drm_mode_cursor_ioctl+0xd8/0x130 [drm]
ÂÂ drm_ioctl_kernel+0x1d4/0x260 [drm]
ÂÂ drm_ioctl+0x433/0x920 [drm]
ÂÂ amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
ÂÂ do_vfs_ioctl+0x1a1/0x13d0
ÂÂ ksys_ioctl+0x60/0x90
ÂÂ __x64_sys_ioctl+0x6f/0xb0
ÂÂ do_syscall_64+0x147/0x440
ÂÂ entry_SYSCALL_64_after_hwframe+0x44/0xa9

The buggy address belongs to the object at ffff8803a697b068
ÂÂ which belongs to the cache kmalloc-1024 of size 1024
The buggy address is located 9 bytes inside of
ÂÂ 1024-byte region [ffff8803a697b068, ffff8803a697b468)
The buggy address belongs to the page:
page:ffffea000e9a5e00 count:1 mapcount:0 mapping:ffff88041e00efc0 index:0x0
compound_mapcount: 0
flags: 0x8000000000008100(slab|head)
raw: 8000000000008100 ffffea000ecbc208 ffff88041e000c70 ffff88041e00efc0
raw: 0000000000000000 0000000000170017 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
ÂÂ ffff8803a697af00: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ÂÂ ffff8803a697af80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff8803a697b000: fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb
ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ ^
ÂÂ ffff8803a697b080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ÂÂ ffff8803a697b100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

So, we fix this by counting the number of CRTCs this atomic commit disabled
early on in the function before their atomic states have been freed, then
use
that count later to do the appropriate number of RPM puts at the end of the
function.
I am a bit not clear, are you saying that the problem was the 'in the
middle' commit (cursor ioctl) doing

drm_atomic_state_default_clear->dm_crtc_destroy_state->kfree(state)

where the state is the one you access from from the non blocking part of
page flip though old_crtc_state->active?
The problem is that (see the comment in drivers/gpu/drm/drm_atomic_helper.c:2065
) it's unsafe to touch any of the old_crtc_state structures after
drm_atomic_helper_commit_hw_done() is called, as it's likely that they've been
freed already.

IÂ am not sure about that, the comment in drm_atomic_helper_commit_hw_done says that
"the driver is not allowed to read or change any permanent software
or hardware modeset state" I interpret it as not the old_crtc_state but as the new_crtc_state or crtc->state after
drm_atomic_helper_swap_state completed. It means that if you touch crtc->state after drm_atomic_helper_commit_hw_done
you actually could already be accessing a state which belong to the next atomic commit after you.
It really looks like cursor's atomic commit sneaks in in a middle of page flip between the page flip IOCTL
and it's commit_tail part and swaps away crct->state to his own new state and release the 'old' state which is not really
old yet and needs to be used by the tail part of page flip. This makes sense since do_aquire_global_lock we use in amdgpu_dm_atomic_check
to serialize against concurrent atomic_commits is not called for case of cursor plane and so it may race against any commit_tail in flight...
Not sure why we haven't seen this problem before.
Obviously your fix makes the problem go away since you stopped accessing the new_crtc_state and not the old_crtc_state but the root problem
seems to me still there.

Andrey

I took another look and actually no problem with the CURSOR IOCTL as it will wait in drm_atomic_helper_swap_state for hw_done event, so
I agree with the fix but just disagree with the explanation, it should be said that it's unsafe to touch the new_crtc_state (same as crtc->state) after
call to drm_atomic_helper_commit_hw_done. So I would make the explanation a bit more detailed on this point.

Anyway, the fix is Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>

Andrey




Andrey
Fixes: 97028037a38ae ("drm/amdgpu: Grab/put runtime PM references in
atomic_commit_tail()")
Signed-off-by: Lyude Paul <lyude@xxxxxxxxxx>
Cc: Michel DÃnzer <michel@xxxxxxxxxxx>
Reported-by: Michel DÃnzer <michel@xxxxxxxxxxx>
---
ÂÂ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++----
ÂÂ 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index f9add85157e7..689dbdf44bbf 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4206,6 +4206,7 @@ static void amdgpu_dm_atomic_commit_tail(struct
drm_atomic_state *state)
ÂÂÂÂÂÂ struct drm_connector *connector;
ÂÂÂÂÂÂ struct drm_connector_state *old_con_state, *new_con_state;
ÂÂÂÂÂÂ struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
+ÂÂÂ int crtc_disable_count = 0;
ÂÂ ÂÂÂÂÂÂ drm_atomic_helper_update_legacy_modeset_state(dev, state);
ÂÂ @@ -4410,6 +4411,9 @@ static void amdgpu_dm_atomic_commit_tail(struct
drm_atomic_state *state)
ÂÂÂÂÂÂÂÂÂÂ struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc);
ÂÂÂÂÂÂÂÂÂÂ bool modeset_needed;
ÂÂ +ÂÂÂÂÂÂÂ if (old_crtc_state->active && !new_crtc_state->active)
+ÂÂÂÂÂÂÂÂÂÂÂ crtc_disable_count++;
+
ÂÂÂÂÂÂÂÂÂÂ dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
ÂÂÂÂÂÂÂÂÂÂ dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
ÂÂÂÂÂÂÂÂÂÂ modeset_needed = modeset_required(
@@ -4463,11 +4467,9 @@ static void amdgpu_dm_atomic_commit_tail(struct
drm_atomic_state *state)
ÂÂÂÂÂÂÂ * so we can put the GPU into runtime suspend if we're not driving
any
ÂÂÂÂÂÂÂ * displays anymore
ÂÂÂÂÂÂÂ */
+ÂÂÂ for (i = 0; i < crtc_disable_count; i++)
+ÂÂÂÂÂÂÂ pm_runtime_put_autosuspend(dev->dev);
ÂÂÂÂÂÂ pm_runtime_mark_last_busy(dev->dev);
-ÂÂÂ for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state,
new_crtc_state, i) {
-ÂÂÂÂÂÂÂ if (old_crtc_state->active && !new_crtc_state->active)
-ÂÂÂÂÂÂÂÂÂÂÂ pm_runtime_put_autosuspend(dev->dev);
-ÂÂÂ }
ÂÂ }