Processes hang in an unkillable state

From: Robert ÅwiÄcki
Date: Tue Apr 12 2011 - 06:32:43 EST


Hi, while fuzzing Linux system calls (32bit fuzzer, 64bi linux
kernel), it happens after some time (10-20mins) that some processes
enter a state which makes them un-killable. They are either in R or D
state.

# strace ps wwuax
...
...
open("/proc/450/cmdline", O_RDONLY) = 6
read(6, - hangs....

# kill -9 450
# kill -9 450 (no ESRCH)

More data in the attachment - I'll keep it in the kdb session for
further examination.

--
Robert ÅwiÄcki
root@ise-test:/proc/450# uname -a
Linux ise-test 2.6.38 #6 SMP PREEMPT Tue Apr 12 11:25:19 CEST 2011 x86_64 GNU/Linux

root@ise-test:/proc/450# cat status
Name: iknowthis
State: D (disk sleep)
Tgid: 450
Pid: 450
PPid: 1
TracerPid: 0
Uid: 1001 1001 1001 1001
Gid: 1001 1001 1001 1001
FDSize: 64
Groups: 1001
VmPeak: 12292 kB
VmSize: 11116 kB
VmLck: 44 kB
VmHWM: 1904 kB
VmRSS: 1324 kB
VmData: 712 kB
VmStk: 7616 kB
VmExe: 160 kB
VmLib: 2556 kB
VmPTE: 28 kB
VmSwap: 0 kB
Threads: 1
SigQ: 785/16382
SigPnd: 0000000000000100 ?? SIGKILL
ShdPnd: 0000000000000100
SigBlk: 0000000000000000
SigIgn: 0000000001001000
SigCgt: 0000000000000000
CapInh: 0000000000000000
CapPrm: 0000000000000000
CapEff: 0000000000000000
CapBnd: ffffffffffffffff
Cpus_allowed: ff
Cpus_allowed_list: 0-7
Mems_allowed: 00000000,00000001
Mems_allowed_list: 0
voluntary_ctxt_switches: 2303
nonvoluntary_ctxt_switches: 287

# cat wchan
call_rwsem_down_write_failed

root@ise-test:/proc/450# ls -l fd/
total 0
lrwx------ 1 test test 64 2011-04-12 12:17 0 -> /dev/pts/1
lrwx------ 1 test test 64 2011-04-12 12:17 1 -> /dev/pts/1
lr-x------ 1 test test 64 2011-04-12 12:17 10 -> /dev/shm/??(?x???d?=R1Y ??C??\:?i}?)B????d!0?E?g??? (deleted)
lr-x------ 1 test test 64 2011-04-12 12:17 11 -> /home
l-wx------ 1 test test 64 2011-04-12 12:17 12 -> /var/lock/?=菥+????7???????@N??e训j?P5麓?~?w?PomYK??ny_???ulv?
l-wx------ 1 test test 64 2011-04-12 12:17 13 -> /var/lock/) ?????????`v?-药uQ?????O?臧????Nnhc ??N?I?G???权??????!?{V??ssifs>???N????q?I????!T>??3??z?tI4?B?X"T??? (deleted)
lr-x------ 1 test test 64 2011-04-12 12:17 14 -> /dev/shm/????]?1???B??c?T?eu????X????????????^???~jF???-?e?m?u
l-wx------ 1 test test 64 2011-04-12 12:17 15 -> /var/lock/?绗???q?jAHu??o ???u???????O?;B??QmSX???p???S??w{??,?)???D??V???q?酶#??
lrwx------ 1 test test 64 2011-04-12 12:17 2 -> /dev/pts/1
lr-x------ 1 test test 64 2011-04-12 12:17 3 -> /dev/urandom
lrwx------ 1 test test 64 2011-04-12 12:17 4 -> anon_inode:[timerfd]
lr-x------ 1 test test 64 2011-04-12 12:17 5 -> /sys/kernel/debug/asoc/codecs
lr-x------ 1 test test 64 2011-04-12 12:17 6 -> /proc/i2o
l-wx------ 1 test test 64 2011-04-12 12:17 7 -> /dev/shm/?????z?u????????Z???n"?d?i3???????E??^?????O?}7+?[??#!???&A?R~&??????(??w???\w?c"E?&?"?宓?)??+?-?M_??R?Y??R???6?FAhGu?R?6?k??????l?B?w??~???^??i?????[.? (deleted)
lr-x------ 1 test test 64 2011-04-12 12:17 8 -> /var/lock/??????C???\#]???b-??????q=???,?v?]???????a?i??????\??HTb??c?@\0?M???????????|???x?X?[{&+?$?X???????9?
l-wx------ 1 test test 64 2011-04-12 12:17 9 -> /var/lock/BF}???P(??`?习??.???

root@ise-test:/proc/450# for x in fdinfo/*; do echo == $x ==; cat "$x"; done
== fdinfo/0 ==
pos: 0
flags: 0100002
== fdinfo/1 ==
pos: 0
flags: 0100002
== fdinfo/10 ==
pos: 1351256300595494888
flags: 024240004004
== fdinfo/11 ==
pos: 1084786014
flags: 04040620010
== fdinfo/12 ==
pos: 1811044362
flags: 0100001
== fdinfo/13 ==
pos: 3226073394
flags: 0100001
== fdinfo/14 ==
pos: 803230536
flags: 020026710024
== fdinfo/15 ==
pos: 1103604314
flags: 0100001
== fdinfo/2 ==
pos: 0
flags: 0100002
== fdinfo/3 ==
pos: 0
flags: 00
== fdinfo/4 ==
pos: 0
flags: 02
== fdinfo/5 ==
pos: 7081918111360778253
flags: 020200004044
== fdinfo/6 ==
pos: 2075525754
flags: 0200200000
== fdinfo/7 ==
pos: 2145128454
flags: 0100001
== fdinfo/8 ==
pos: 1343491212
flags: 020471010020
== fdinfo/9 ==
pos: 1073743904
flags: 01100001

(gdb) attach 450
Attaching to process 450
[hangs]

root@ise-test:~# cd /proc/450/
root@ise-test:/proc/450# ls
[hangs, but it worked]

root@ise-test:~# cat /proc/mounts
rootfs / rootfs rw 0 0
/dev/root / ext4 rw,relatime,errors=remount-ro,barrier=1,data=ordered 0 0
devtmpfs /dev devtmpfs rw,relatime,size=1981824k,nr_inodes=495456,mode=755 0 0
none /proc proc rw,nosuid,nodev,noexec,relatime 0 0
none /sys sysfs rw,nosuid,nodev,noexec,relatime 0 0
none /proc/sys/fs/binfmt_misc binfmt_misc rw,nosuid,nodev,noexec,relatime 0 0
none /sys/fs/fuse/connections fusectl rw,relatime 0 0
none /sys/kernel/debug debugfs rw,relatime 0 0
none /sys/kernel/security securityfs rw,relatime 0 0
none /dev/pts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 0 0
none /dev/shm tmpfs rw,nosuid,nodev,relatime 0 0
none /var/run tmpfs rw,nosuid,relatime,mode=755 0 0
none /var/lock tmpfs rw,nosuid,nodev,noexec,relatime 0 0

root@ise-test:~# ls -l /proc/450/root
lrwxrwxrwx 1 test test 0 2011-04-12 12:17 /proc/450/root -> /

root@ise-test:~# cat /proc/450/smaps
[hangs]

root@ise-test:~# cat /proc/450/syscall
7 0xac4 0xff806d88 0x40000000 0x96 0xac5 0xff806da8 0xff806d18 0xffffe430
[0]kdb> bt
Stack traceback for pid 450
0xffff880118905cc0 450 1 0 0 D 0xffff880118906140 iknowthis
<c> ffff88011da8fb58<c> 0000000000000086<c> ffff88012bffbe0a<c> 0000000000000001<c>
<c> ffff880132373331<c> ffff88011da8e000<c> ffff88011da8e000<c> ffff880118905cc0<c>
<c> ffff88011da8e010<c> ffff880118906078<c> ffff88011da8ffd8<c> ffff88011da8ffd8<c>
Call Trace:
[<ffffffff8224f665>] rwsem_down_failed_common+0xc5/0x160
[<ffffffff8224f713>] rwsem_down_write_failed+0x13/0x20
[<ffffffff815956c3>] call_rwsem_down_write_failed+0x13/0x20
[<ffffffff8224e985>] ? down_write+0x25/0x30
[<ffffffff811b0819>] do_coredump+0x139/0xbb0
[<ffffffff810a4e09>] ? sub_preempt_count+0xa9/0xe0
[<ffffffff810cb424>] ? __dequeue_signal+0xf4/0x190
[<ffffffff810cbb4b>] get_signal_to_deliver+0x29b/0x4a0
[<ffffffff8104e365>] do_signal+0x75/0x7b0
[<ffffffff810f8687>] ? compat_sys_wait4+0x97/0xe0
[<ffffffff810b5322>] ? do_fork+0xe2/0x370
[<ffffffff8104eb05>] do_notify_resume+0x65/0x90
[<ffffffff8224f859>] ? trace_hardirqs_on_thunk+0x3a/0x3c
[<ffffffff8104f378>] int_signal+0x12/0x17

[0]kdb> btc 0
Stack traceback for pid 23592
0xffff880112c04590 23592 1 1 0 R 0xffff880112c04a10 *iknowthis
<c> ffff8800cfc03dc8<c> 0000000000000018<c>
Call Trace:
<#DB> <<EOE>> <IRQ> [<ffffffff816cc6b9>] ? __handle_sysrq+0x129/0x190
[<ffffffff816cc926>] ? handle_sysrq+0x26/0x30
[<ffffffff816e4b8d>] ? serial8250_handle_port+0x21d/0x320
[<ffffffff816e4d2b>] ? serial8250_interrupt+0x9b/0x140
[<ffffffff8111a8b8>] ? handle_IRQ_event+0x68/0x1b0
[<ffffffff8111e7b5>] ? move_native_irq+0x15/0x70
[<ffffffff8111ceae>] ? handle_edge_irq+0xbe/0x170
[<ffffffff81052319>] ? handle_irq+0x49/0xa0
[<ffffffff82250dfd>] ? do_IRQ+0x5d/0xe0
[<ffffffff82250693>] ? ret_from_intr+0x0/0x1a
<EOI> [<ffffffff8117a7b8>] ? do_mlock_pages+0x78/0x150
[<ffffffff8117a7b8>] ? do_mlock_pages+0x78/0x150
[<ffffffff8117b21c>] ? sys_mlock+0x11c/0x130
[<ffffffff8109a653>] ? ia32_sysret+0x0/0x5
[0]kdb> btc 1
Stack traceback for pid 15223
0xffff88011ebb4590 15223 5594 1 1 R 0xffff88011ebb4a10 iknowthis
<c> ffff88011e529c48<c> 0000000000000018<c>
Call Trace:


[0]kdb> summary
sysname Linux
release 2.6.38
version #6 SMP PREEMPT Tue Apr 12 11:25:19 CEST 2011
machine x86_64
nodename ise-test
domainname (none)
ccversion CCVERSION
date 2011-04-12 10:24:42 tz_minuteswest -120
uptime 00:42
load avg 38.13 35.98 28.81

MemTotal: 992186 kB
MemFree: 899924 kB
Buffers: 5411 kB


after go in kdb, and another kdb session:

[0]kdb> btc 0
Stack traceback for pid 15802
0xffff88011e818000 15802 1 1 0 R 0xffff88011e818480 *iknowthis
<c> ffff8800cfc03dc8<c> 0000000000000018<c>
Call Trace:
<#DB> <<EOE>> <IRQ> [<ffffffff816cc6b9>] ? __handle_sysrq+0x129/0x190
[<ffffffff816cc926>] ? handle_sysrq+0x26/0x30
[<ffffffff816e4b8d>] ? serial8250_handle_port+0x21d/0x320
[<ffffffff816e4d2b>] ? serial8250_interrupt+0x9b/0x140
[<ffffffff8111a8b8>] ? handle_IRQ_event+0x68/0x1b0
[<ffffffff8111e7b5>] ? move_native_irq+0x15/0x70
[<ffffffff8111ceae>] ? handle_edge_irq+0xbe/0x170
[<ffffffff81052319>] ? handle_irq+0x49/0xa0
[<ffffffff82250dfd>] ? do_IRQ+0x5d/0xe0
[<ffffffff82250693>] ? ret_from_intr+0x0/0x1a
<EOI> [<ffffffff82250705>] ? restore_args+0x0/0x30
[<ffffffff8117a71d>] ? __mlock_vma_pages_range+0x9d/0xc0
[<ffffffff8117a703>] ? __mlock_vma_pages_range+0x83/0xc0
[<ffffffff8117a7b8>] ? do_mlock_pages+0x78/0x150
[<ffffffff8117b21c>] ? sys_mlock+0x11c/0x130
[<ffffffff8109a653>] ? ia32_sysret+0x0/0x5


and another go/kdb:

[0]kdb> btc 0
Stack traceback for pid 15802
0xffff88011e818000 15802 1 1 0 R 0xffff88011e818480 *iknowthis
<c> ffff8800cfc03dc8<c> 0000000000000018<c>
Call Trace:
<#DB> <<EOE>> <IRQ> [<ffffffff816cc6b9>] ? __handle_sysrq+0x129/0x190
[<ffffffff816cc926>] ? handle_sysrq+0x26/0x30
[<ffffffff816e4b8d>] ? serial8250_handle_port+0x21d/0x320
[<ffffffff816e4d2b>] ? serial8250_interrupt+0x9b/0x140
[<ffffffff8111a8b8>] ? handle_IRQ_event+0x68/0x1b0
[<ffffffff8111e7b5>] ? move_native_irq+0x15/0x70
[<ffffffff8111ceae>] ? handle_edge_irq+0xbe/0x170
[<ffffffff81052319>] ? handle_irq+0x49/0xa0
[<ffffffff82250dfd>] ? do_IRQ+0x5d/0xe0
[<ffffffff82250693>] ? ret_from_intr+0x0/0x1a
<EOI> [<ffffffff8117a7d2>] ? do_mlock_pages+0x92/0x150
[<ffffffff8117a7b8>] ? do_mlock_pages+0x78/0x150
[<ffffffff8117b21c>] ? sys_mlock+0x11c/0x130
[<ffffffff8109a653>] ? ia32_sysret+0x0/0x5