Re: system hung up when offlining CPUs

From: YASUAKI ISHIMATSU
Date: Mon Oct 16 2017 - 14:59:29 EST


Hi Thomas,

> Can you please apply the patch below on top of Linus tree and retest?
>
> Please send me the outputs I asked you to provide last time in any case
> (success or fail).

The issue still occurs even if I applied your patch to linux 4.14.0-rc4.

---
[ ...] INFO: task setroubleshootd:4972 blocked for more than 120 seconds.
[ ...] Not tainted 4.14.0-rc4.thomas.with.irqdebug+ #6
[ ...] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ ...] setroubleshootd D 0 4972 1 0x00000080
[ ...] Call Trace:
[ ...] __schedule+0x28d/0x890
[ ...] ? release_pages+0x16f/0x3f0
[ ...] schedule+0x36/0x80
[ ...] io_schedule+0x16/0x40
[ ...] wait_on_page_bit+0x107/0x150
[ ...] ? page_cache_tree_insert+0xb0/0xb0
[ ...] truncate_inode_pages_range+0x3dd/0x7d0
[ ...] ? schedule_hrtimeout_range_clock+0xad/0x140
[ ...] ? remove_wait_queue+0x59/0x60
[ ...] ? down_write+0x12/0x40
[ ...] ? unmap_mapping_range+0x75/0x130
[ ...] truncate_pagecache+0x47/0x60
[ ...] truncate_setsize+0x32/0x40
[ ...] xfs_setattr_size+0x100/0x300 [xfs]
[ ...] xfs_vn_setattr_size+0x40/0x90 [xfs]
[ ...] xfs_vn_setattr+0x87/0xa0 [xfs]
[ ...] notify_change+0x266/0x440
[ ...] do_truncate+0x75/0xc0
[ ...] path_openat+0xaba/0x13b0
[ ...] ? mem_cgroup_commit_charge+0x31/0x130
[ ...] do_filp_open+0x91/0x100
[ ...] ? __alloc_fd+0x46/0x170
[ ...] do_sys_open+0x124/0x210
[ ...] SyS_open+0x1e/0x20
[ ...] do_syscall_64+0x67/0x1b0
[ ...] entry_SYSCALL64_slow_path+0x25/0x25
[ ...] RIP: 0033:0x7f275e2365bd
[ ...] RSP: 002b:00007ffe29337da0 EFLAGS: 00000293 ORIG_RAX: 0000000000000002
[ ...] RAX: ffffffffffffffda RBX: 00000000040aea00 RCX: 00007f275e2365bd
[ ...] RDX: 00000000000001b6 RSI: 0000000000000241 RDI: 00000000040ae840
[ ...] RBP: 00007ffe29337e00 R08: 00000000040aea06 R09: 0000000000000240
[ ...] R10: 0000000000000024 R11: 0000000000000293 R12: 00000000040eb660
[ ...] R13: 0000000000000004 R14: 00000000040ae840 R15: 000000000186a0a0
[ ...] sd 0:2:0:0: [sda] tag#0 task abort called for scmd(ffff9b4bf2306160)
[ ...] sd 0:2:0:0: [sda] tag#0 CDB: Write(10) 2a 00 0b 3a 82 a0 00 00 20 00
[ ...] sd 0:2:0:0: task abort: FAILED scmd(ffff9b4bf2306160)
[ ...] sd 0:2:0:0: target reset called for scmd(ffff9b4bf2306160)
[ ...] sd 0:2:0:0: [sda] tag#0 megasas: target reset FAILED!!
[ ...] sd 0:2:0:0: [sda] tag#0 Controller reset is requested due to IO timeout
[ ...] SCSI command pointer: (ffff9b4bf2306160) SCSI host state: 5 SCSI
---

I could not prepare the same environment I reported. So I reproduced
the issue on the following megasas environment.

---
IRQ affinity_list IRQ_TYPE
34 0-1 IR-PCI-MSI 1048576-edge megasas
35 2-3 IR-PCI-MSI 1048577-edge megasas
36 4-5 IR-PCI-MSI 1048578-edge megasas
37 6-7 IR-PCI-MSI 1048579-edge megasas
38 8-9 IR-PCI-MSI 1048580-edge megasas
39 10-11 IR-PCI-MSI 1048581-edge megasas
40 12-13 IR-PCI-MSI 1048582-edge megasas
41 14-15 IR-PCI-MSI 1048583-edge megasas
42 16-17 IR-PCI-MSI 1048584-edge megasas
43 18-19 IR-PCI-MSI 1048585-edge megasas
44 20-21 IR-PCI-MSI 1048586-edge megasas
45 22-23 IR-PCI-MSI 1048587-edge megasas
46 24-25 IR-PCI-MSI 1048588-edge megasas
47 26-27 IR-PCI-MSI 1048589-edge megasas
48 28-29 IR-PCI-MSI 1048590-edge megasas
49 30-31 IR-PCI-MSI 1048591-edge megasas
50 32-33 IR-PCI-MSI 1048592-edge megasas
51 34-35 IR-PCI-MSI 1048593-edge megasas
52 36-37 IR-PCI-MSI 1048594-edge megasas
53 38-39 IR-PCI-MSI 1048595-edge megasas
54 40-41 IR-PCI-MSI 1048596-edge megasas
55 42-43 IR-PCI-MSI 1048597-edge megasas
56 44-45 IR-PCI-MSI 1048598-edge megasas
57 46-47 IR-PCI-MSI 1048599-edge megasas
58 48-49 IR-PCI-MSI 1048600-edge megasas
59 50-51 IR-PCI-MSI 1048601-edge megasas
60 52-53 IR-PCI-MSI 1048602-edge megasas
61 54-55 IR-PCI-MSI 1048603-edge megasas
62 56-57 IR-PCI-MSI 1048604-edge megasas
63 58-59 IR-PCI-MSI 1048605-edge megasas
64 60-61 IR-PCI-MSI 1048606-edge megasas
65 62-63 IR-PCI-MSI 1048607-edge megasas
66 64-65 IR-PCI-MSI 1048608-edge megasas
67 66-67 IR-PCI-MSI 1048609-edge megasas
68 68-69 IR-PCI-MSI 1048610-edge megasas
69 70-71 IR-PCI-MSI 1048611-edge megasas
70 72-73 IR-PCI-MSI 1048612-edge megasas
71 74-75 IR-PCI-MSI 1048613-edge megasas
72 76-77 IR-PCI-MSI 1048614-edge megasas
73 78-79 IR-PCI-MSI 1048615-edge megasas
74 80-81 IR-PCI-MSI 1048616-edge megasas
75 82-83 IR-PCI-MSI 1048617-edge megasas
76 84-85 IR-PCI-MSI 1048618-edge megasas
77 86-87 IR-PCI-MSI 1048619-edge megasas
78 88-89 IR-PCI-MSI 1048620-edge megasas
79 90-91 IR-PCI-MSI 1048621-edge megasas
80 92-93 IR-PCI-MSI 1048622-edge megasas
81 94-95 IR-PCI-MSI 1048623-edge megasas
82 96-97 IR-PCI-MSI 1048624-edge megasas
83 98-99 IR-PCI-MSI 1048625-edge megasas
84 100-101 IR-PCI-MSI 1048626-edge megasas
85 102-103 IR-PCI-MSI 1048627-edge megasas
86 104-105 IR-PCI-MSI 1048628-edge megasas
87 106-107 IR-PCI-MSI 1048629-edge megasas
88 108-109 IR-PCI-MSI 1048630-edge megasas
89 110-111 IR-PCI-MSI 1048631-edge megasas
90 112-113 IR-PCI-MSI 1048632-edge megasas
91 114-115 IR-PCI-MSI 1048633-edge megasas
92 116-117 IR-PCI-MSI 1048634-edge megasas
93 118-119 IR-PCI-MSI 1048635-edge megasas
94 120-121 IR-PCI-MSI 1048636-edge megasas
95 122-123 IR-PCI-MSI 1048637-edge megasas
96 124-125 IR-PCI-MSI 1048638-edge megasas
97 126-127 IR-PCI-MSI 1048639-edge megasas
98 128-129 IR-PCI-MSI 1048640-edge megasas
99 130-131 IR-PCI-MSI 1048641-edge megasas
100 132-133 IR-PCI-MSI 1048642-edge megasas
101 134-135 IR-PCI-MSI 1048643-edge megasas
102 136-137 IR-PCI-MSI 1048644-edge megasas
103 138-139 IR-PCI-MSI 1048645-edge megasas
104 140-141 IR-PCI-MSI 1048646-edge megasas
105 142-143 IR-PCI-MSI 1048647-edge megasas
106 144-145 IR-PCI-MSI 1048648-edge megasas
107 146-147 IR-PCI-MSI 1048649-edge megasas
108 148-149 IR-PCI-MSI 1048650-edge megasas
109 150-151 IR-PCI-MSI 1048651-edge megasas
110 152-153 IR-PCI-MSI 1048652-edge megasas
111 154-155 IR-PCI-MSI 1048653-edge megasas
112 156-157 IR-PCI-MSI 1048654-edge megasas
113 158-159 IR-PCI-MSI 1048655-edge megasas
114 160-161 IR-PCI-MSI 1048656-edge megasas
115 162-163 IR-PCI-MSI 1048657-edge megasas
116 164-165 IR-PCI-MSI 1048658-edge megasas
117 166-167 IR-PCI-MSI 1048659-edge megasas
118 168-169 IR-PCI-MSI 1048660-edge megasas
119 170-171 IR-PCI-MSI 1048661-edge megasas
120 172-173 IR-PCI-MSI 1048662-edge megasas
121 174-175 IR-PCI-MSI 1048663-edge megasas
122 176-177 IR-PCI-MSI 1048664-edge megasas
123 178-179 IR-PCI-MSI 1048665-edge megasas
124 180-181 IR-PCI-MSI 1048666-edge megasas
125 182-183 IR-PCI-MSI 1048667-edge megasas
126 184-185 IR-PCI-MSI 1048668-edge megasas
127 186-187 IR-PCI-MSI 1048669-edge megasas
128 188-189 IR-PCI-MSI 1048670-edge megasas
129 190-191 IR-PCI-MSI 1048671-edge megasas
---

Here are trace log that I offlined CPU 186-191 in descending order.
When I offlined CPU 186, the issue occurred.

---
# tracer: nop
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
systemd-1 [000] d... 0.427765: irq_do_set_affinity: irq: 24 ret 0 mask: 0-23 eff: 0
systemd-1 [029] d... 16.745803: irq_do_set_affinity: irq: 9 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 21.850146: irq_do_set_affinity: irq: 25 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 21.856549: irq_do_set_affinity: irq: 26 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 21.862920: irq_do_set_affinity: irq: 27 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 21.869300: irq_do_set_affinity: irq: 28 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 21.875685: irq_do_set_affinity: irq: 29 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 21.897267: irq_do_set_affinity: irq: 30 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 23.983226: irq_do_set_affinity: irq: 31 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 23.998459: irq_do_set_affinity: irq: 32 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 26.095152: irq_do_set_affinity: irq: 33 ret 2 mask: 0-23 eff: 0-5
kworker/0:3-1458 [000] d... 28.497033: irq_do_set_affinity: irq: 16 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 28.715688: irq_do_set_affinity: irq: 8 ret 2 mask: 0-23 eff: 0-5
systemd-1 [120] d... 29.163740: irq_do_set_affinity: irq: 4 ret 2 mask: 0-23 eff: 0-5
kworker/0:1-134 [000] d... 30.625367: irq_do_set_affinity: irq: 34 ret 2 mask: 0-1 eff: 0-1
kworker/0:1-134 [000] d... 30.625400: irq_do_set_affinity: irq: 35 ret 2 mask: 2-3 eff: 2-3
kworker/0:1-134 [000] d... 30.625442: irq_do_set_affinity: irq: 36 ret 2 mask: 4-5 eff: 4-5
kworker/0:1-134 [000] d... 30.625474: irq_do_set_affinity: irq: 37 ret 2 mask: 6-7 eff: 6-7
kworker/0:1-134 [000] d... 30.625513: irq_do_set_affinity: irq: 38 ret 2 mask: 8-9 eff: 8-9
kworker/0:1-134 [000] d... 30.625549: irq_do_set_affinity: irq: 39 ret 2 mask: 10-11 eff: 10-11
kworker/0:1-134 [000] d... 30.625585: irq_do_set_affinity: irq: 40 ret 2 mask: 12-13 eff: 12-13
kworker/0:1-134 [000] d... 30.625621: irq_do_set_affinity: irq: 41 ret 2 mask: 14-15 eff: 14-15
kworker/0:1-134 [000] d... 30.625656: irq_do_set_affinity: irq: 42 ret 2 mask: 16-17 eff: 16-17
kworker/0:1-134 [000] d... 30.625692: irq_do_set_affinity: irq: 43 ret 2 mask: 18-19 eff: 18-19
kworker/0:1-134 [000] d... 30.625732: irq_do_set_affinity: irq: 44 ret 2 mask: 20-21 eff: 20-21
kworker/0:1-134 [000] d... 30.625768: irq_do_set_affinity: irq: 45 ret 2 mask: 22-23 eff: 22-23
kworker/0:1-134 [000] d... 30.625801: irq_do_set_affinity: irq: 46 ret 2 mask: 24-25 eff: 24-25
kworker/0:1-134 [000] d... 30.625818: irq_do_set_affinity: irq: 47 ret 2 mask: 26-27 eff: 26-27
kworker/0:1-134 [000] d... 30.625843: irq_do_set_affinity: irq: 48 ret 2 mask: 28-29 eff: 28-29
kworker/0:1-134 [000] d... 30.625869: irq_do_set_affinity: irq: 49 ret 2 mask: 30-31 eff: 30-31
kworker/0:1-134 [000] d... 30.625897: irq_do_set_affinity: irq: 50 ret 2 mask: 32-33 eff: 32-33
kworker/0:1-134 [000] d... 30.625922: irq_do_set_affinity: irq: 51 ret 2 mask: 34-35 eff: 34-35
kworker/0:1-134 [000] d... 30.625947: irq_do_set_affinity: irq: 52 ret 2 mask: 36-37 eff: 36-37
kworker/0:1-134 [000] d... 30.625969: irq_do_set_affinity: irq: 53 ret 2 mask: 38-39 eff: 38-39
kworker/0:1-134 [000] d... 30.625992: irq_do_set_affinity: irq: 54 ret 2 mask: 40-41 eff: 40-41
kworker/0:1-134 [000] d... 30.626012: irq_do_set_affinity: irq: 55 ret 2 mask: 42-43 eff: 42-43
kworker/0:1-134 [000] d... 30.626032: irq_do_set_affinity: irq: 56 ret 2 mask: 44-45 eff: 44-45
kworker/0:1-134 [000] d... 30.626052: irq_do_set_affinity: irq: 57 ret 2 mask: 46-47 eff: 46-47
kworker/0:1-134 [000] d... 30.626088: irq_do_set_affinity: irq: 58 ret 2 mask: 48-49 eff: 48-49
kworker/0:1-134 [000] d... 30.626105: irq_do_set_affinity: irq: 59 ret 2 mask: 50-51 eff: 50-51
kworker/0:1-134 [000] d... 30.626118: irq_do_set_affinity: irq: 60 ret 2 mask: 52-53 eff: 52-53
kworker/0:1-134 [000] d... 30.626157: irq_do_set_affinity: irq: 61 ret 2 mask: 54-55 eff: 54-55
kworker/0:1-134 [000] d... 30.626185: irq_do_set_affinity: irq: 62 ret 2 mask: 56-57 eff: 56-57
kworker/0:1-134 [000] d... 30.626217: irq_do_set_affinity: irq: 63 ret 2 mask: 58-59 eff: 58-59
kworker/0:1-134 [000] d... 30.626243: irq_do_set_affinity: irq: 64 ret 2 mask: 60-61 eff: 60-61
kworker/0:1-134 [000] d... 30.626269: irq_do_set_affinity: irq: 65 ret 2 mask: 62-63 eff: 62-63
kworker/0:1-134 [000] d... 30.626299: irq_do_set_affinity: irq: 66 ret 2 mask: 64-65 eff: 64-65
kworker/0:1-134 [000] d... 30.626322: irq_do_set_affinity: irq: 67 ret 2 mask: 66-67 eff: 66-67
kworker/0:1-134 [000] d... 30.626346: irq_do_set_affinity: irq: 68 ret 2 mask: 68-69 eff: 68-69
kworker/0:1-134 [000] d... 30.626368: irq_do_set_affinity: irq: 69 ret 2 mask: 70-71 eff: 70-71
kworker/0:1-134 [000] d... 30.626390: irq_do_set_affinity: irq: 70 ret 2 mask: 72-73 eff: 72-73
kworker/0:1-134 [000] d... 30.626405: irq_do_set_affinity: irq: 71 ret 2 mask: 74-75 eff: 74-75
kworker/0:1-134 [000] d... 30.626417: irq_do_set_affinity: irq: 72 ret 2 mask: 76-77 eff: 76-77
kworker/0:1-134 [000] d... 30.626455: irq_do_set_affinity: irq: 73 ret 2 mask: 78-79 eff: 78-79
kworker/0:1-134 [000] d... 30.626483: irq_do_set_affinity: irq: 74 ret 2 mask: 80-81 eff: 80-81
kworker/0:1-134 [000] d... 30.626510: irq_do_set_affinity: irq: 75 ret 2 mask: 82-83 eff: 82-83
kworker/0:1-134 [000] d... 30.626535: irq_do_set_affinity: irq: 76 ret 2 mask: 84-85 eff: 84-85
kworker/0:1-134 [000] d... 30.626563: irq_do_set_affinity: irq: 77 ret 2 mask: 86-87 eff: 86-87
kworker/0:1-134 [000] d... 30.626585: irq_do_set_affinity: irq: 78 ret 2 mask: 88-89 eff: 88-89
kworker/0:1-134 [000] d... 30.626604: irq_do_set_affinity: irq: 79 ret 2 mask: 90-91 eff: 90-91
kworker/0:1-134 [000] d... 30.626624: irq_do_set_affinity: irq: 80 ret 2 mask: 92-93 eff: 92-93
kworker/0:1-134 [000] d... 30.626644: irq_do_set_affinity: irq: 81 ret 2 mask: 94-95 eff: 94-95
kworker/0:1-134 [000] d... 30.626665: irq_do_set_affinity: irq: 82 ret 2 mask: 96-97 eff: 96-97
kworker/0:1-134 [000] d... 30.626679: irq_do_set_affinity: irq: 83 ret 2 mask: 98-99 eff: 98-99
kworker/0:1-134 [000] d... 30.626693: irq_do_set_affinity: irq: 84 ret 2 mask: 100-101 eff: 100-101
kworker/0:1-134 [000] d... 30.626708: irq_do_set_affinity: irq: 85 ret 2 mask: 102-103 eff: 102-103
kworker/0:1-134 [000] d... 30.626750: irq_do_set_affinity: irq: 86 ret 2 mask: 104-105 eff: 104-105
kworker/0:1-134 [000] d... 30.626784: irq_do_set_affinity: irq: 87 ret 2 mask: 106-107 eff: 106-107
kworker/0:1-134 [000] d... 30.626814: irq_do_set_affinity: irq: 88 ret 2 mask: 108-109 eff: 108-109
kworker/0:1-134 [000] d... 30.626844: irq_do_set_affinity: irq: 89 ret 2 mask: 110-111 eff: 110-111
kworker/0:1-134 [000] d... 30.626872: irq_do_set_affinity: irq: 90 ret 2 mask: 112-113 eff: 112-113
kworker/0:1-134 [000] d... 30.626896: irq_do_set_affinity: irq: 91 ret 2 mask: 114-115 eff: 114-115
kworker/0:1-134 [000] d... 30.626928: irq_do_set_affinity: irq: 92 ret 2 mask: 116-117 eff: 116-117
kworker/0:1-134 [000] d... 30.626954: irq_do_set_affinity: irq: 93 ret 2 mask: 118-119 eff: 118-119
kworker/0:1-134 [000] d... 30.626975: irq_do_set_affinity: irq: 94 ret 2 mask: 120-121 eff: 120-121
kworker/0:1-134 [000] d... 30.626996: irq_do_set_affinity: irq: 95 ret 2 mask: 122-123 eff: 122-123
kworker/0:1-134 [000] d... 30.627022: irq_do_set_affinity: irq: 96 ret 2 mask: 124-125 eff: 124-125
kworker/0:1-134 [000] d... 30.627050: irq_do_set_affinity: irq: 97 ret 2 mask: 126-127 eff: 126-127
kworker/0:1-134 [000] d... 30.627081: irq_do_set_affinity: irq: 98 ret 2 mask: 128-129 eff: 128-129
kworker/0:1-134 [000] d... 30.627110: irq_do_set_affinity: irq: 99 ret 2 mask: 130-131 eff: 130-131
kworker/0:1-134 [000] d... 30.627137: irq_do_set_affinity: irq: 100 ret 2 mask: 132-133 eff: 132-133
kworker/0:1-134 [000] d... 30.627164: irq_do_set_affinity: irq: 101 ret 2 mask: 134-135 eff: 134-135
kworker/0:1-134 [000] d... 30.627191: irq_do_set_affinity: irq: 102 ret 2 mask: 136-137 eff: 136-137
kworker/0:1-134 [000] d... 30.627214: irq_do_set_affinity: irq: 103 ret 2 mask: 138-139 eff: 138-139
kworker/0:1-134 [000] d... 30.627238: irq_do_set_affinity: irq: 104 ret 2 mask: 140-141 eff: 140-141
kworker/0:1-134 [000] d... 30.627263: irq_do_set_affinity: irq: 105 ret 2 mask: 142-143 eff: 142-143
kworker/0:1-134 [000] d... 30.627283: irq_do_set_affinity: irq: 106 ret 2 mask: 144-145 eff: 144-145
kworker/0:1-134 [000] d... 30.627296: irq_do_set_affinity: irq: 107 ret 2 mask: 146-147 eff: 146-147
kworker/0:1-134 [000] d... 30.627311: irq_do_set_affinity: irq: 108 ret 2 mask: 148-149 eff: 148-149
kworker/0:1-134 [000] d... 30.627344: irq_do_set_affinity: irq: 109 ret 2 mask: 150-151 eff: 150-151
kworker/0:1-134 [000] d... 30.627377: irq_do_set_affinity: irq: 110 ret 2 mask: 152-153 eff: 152-153
kworker/0:1-134 [000] d... 30.627410: irq_do_set_affinity: irq: 111 ret 2 mask: 154-155 eff: 154-155
kworker/0:1-134 [000] d... 30.627437: irq_do_set_affinity: irq: 112 ret 2 mask: 156-157 eff: 156-157
kworker/0:1-134 [000] d... 30.627467: irq_do_set_affinity: irq: 113 ret 2 mask: 158-159 eff: 158-159
kworker/0:1-134 [000] d... 30.627494: irq_do_set_affinity: irq: 114 ret 2 mask: 160-161 eff: 160-161
kworker/0:1-134 [000] d... 30.627519: irq_do_set_affinity: irq: 115 ret 2 mask: 162-163 eff: 162-163
kworker/0:1-134 [000] d... 30.627545: irq_do_set_affinity: irq: 116 ret 2 mask: 164-165 eff: 164-165
kworker/0:1-134 [000] d... 30.627569: irq_do_set_affinity: irq: 117 ret 2 mask: 166-167 eff: 166-167
kworker/0:1-134 [000] d... 30.627589: irq_do_set_affinity: irq: 118 ret 2 mask: 168-169 eff: 168-169
kworker/0:1-134 [000] d... 30.627607: irq_do_set_affinity: irq: 119 ret 2 mask: 170-171 eff: 170-171
kworker/0:1-134 [000] d... 30.627639: irq_do_set_affinity: irq: 120 ret 2 mask: 172-173 eff: 172-173
kworker/0:1-134 [000] d... 30.627666: irq_do_set_affinity: irq: 121 ret 2 mask: 174-175 eff: 174-175
kworker/0:1-134 [000] d... 30.627691: irq_do_set_affinity: irq: 122 ret 2 mask: 176-177 eff: 176-177
kworker/0:1-134 [000] d... 30.627721: irq_do_set_affinity: irq: 123 ret 2 mask: 178-179 eff: 178-179
kworker/0:1-134 [000] d... 30.627748: irq_do_set_affinity: irq: 124 ret 2 mask: 180-181 eff: 180-181
kworker/0:1-134 [000] d... 30.627774: irq_do_set_affinity: irq: 125 ret 2 mask: 182-183 eff: 182-183
kworker/0:1-134 [000] d... 30.627799: irq_do_set_affinity: irq: 126 ret 2 mask: 184-185 eff: 184-185
kworker/0:1-134 [000] d... 30.627828: irq_do_set_affinity: irq: 127 ret 2 mask: 186-187 eff: 186
kworker/0:1-134 [000] d... 30.627850: irq_do_set_affinity: irq: 128 ret 2 mask: 188-189 eff: 188
kworker/0:1-134 [000] d... 30.627875: irq_do_set_affinity: irq: 129 ret 2 mask: 190-191 eff: 190
kworker/0:0-3 [000] d... 38.217213: irq_do_set_affinity: irq: 18 ret 2 mask: 0-23 eff: 0-5
systemd-udevd-2007 [129] d... 38.510108: irq_do_set_affinity: irq: 3 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732162: irq_do_set_affinity: irq: 131 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732195: irq_do_set_affinity: irq: 132 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732214: irq_do_set_affinity: irq: 133 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732229: irq_do_set_affinity: irq: 134 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732246: irq_do_set_affinity: irq: 135 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732261: irq_do_set_affinity: irq: 136 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732276: irq_do_set_affinity: irq: 137 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732292: irq_do_set_affinity: irq: 138 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.732308: irq_do_set_affinity: irq: 139 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865529: irq_do_set_affinity: irq: 140 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865557: irq_do_set_affinity: irq: 141 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865575: irq_do_set_affinity: irq: 142 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865591: irq_do_set_affinity: irq: 143 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865607: irq_do_set_affinity: irq: 144 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865621: irq_do_set_affinity: irq: 145 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865635: irq_do_set_affinity: irq: 146 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865650: irq_do_set_affinity: irq: 147 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 44.865664: irq_do_set_affinity: irq: 148 ret 2 mask: 0-23 eff: 0-5
NetworkManager-2628 [135] d... 45.041598: irq_do_set_affinity: irq: 130 ret 2 mask: 0-23 eff: 6-9,11,16,18-19,21-23,26-28,31,34-35,38,40-43,47-63
NetworkManager-2628 [135] d... 45.042054: irq_do_set_affinity: irq: 130 ret 2 mask: 0-23 eff: 6-8,10-11,16,18-19,21-23,26-28,31,34-35,38,40-43,47-63
NetworkManager-2628 [135] d... 45.150285: irq_do_set_affinity: irq: 130 ret 2 mask: 0-23 eff: 0-5
(agetty)-3134 [049] d... 55.930794: irq_do_set_affinity: irq: 4 ret 2 mask: 0-23 eff: 0-5
<...>-1346 [191] d..1 100.473714: irq_do_set_affinity: irq: 129 ret 2 mask: 190-191 eff: 190
<...>-1346 [191] d..1 100.473722: <stack trace>
=> native_cpu_disable
=> take_cpu_down
=> multi_cpu_stop
=> cpu_stopper_thread
=> smpboot_thread_fn
=> kthread
=> ret_from_fork
<...>-1334 [189] d..1 700.567235: irq_do_set_affinity: irq: 128 ret 2 mask: 188-189 eff: 188
<...>-1334 [189] d..1 700.567243: <stack trace>
=> native_cpu_disable
=> take_cpu_down
=> multi_cpu_stop
=> cpu_stopper_thread
=> smpboot_thread_fn
=> kthread
=> ret_from_fork
<...>-1322 [187] d..1 1300.660985: irq_do_set_affinity: irq: 127 ret 2 mask: 186-187 eff: 186
<...>-1322 [187] d..1 1300.660993: <stack trace>
=> native_cpu_disable
=> take_cpu_down
=> multi_cpu_stop
=> cpu_stopper_thread
=> smpboot_thread_fn
=> kthread
=> ret_from_fork
---

Thanks,
Yasuaki Ishimatsu

On 10/10/2017 12:30 PM, YASUAKI ISHIMATSU wrote:
> Hi Thomas,
>
> Sorry for the late reply.
>
> I'll apply the patches and retest in this week.
> Please wait a while.
>
> Thanks,
> Yasuaki Ishimatsu
>
> On 10/04/2017 05:04 PM, Thomas Gleixner wrote:
>> On Tue, 3 Oct 2017, Thomas Gleixner wrote:
>>> Can you please apply the debug patch below.
>>
>> I found an issue with managed interrupts when the affinity mask of an
>> managed interrupt spawns multiple CPUs. Explanation in the changelog
>> below. I'm not sure that this cures the problems you have, but at least I
>> could prove that it's not doing what it should do. The failure I'm seing is
>> fixed, but I can't test that megasas driver due to -ENOHARDWARE.
>>
>> Can you please apply the patch below on top of Linus tree and retest?
>>
>> Please send me the outputs I asked you to provide last time in any case
>> (success or fail).
>>
>> @block/scsi folks: Can you please run that through your tests as well?
>>
>> Thanks,
>>
>> tglx
>>
>> 8<-----------------------
>> Subject: genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs
>> From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>> Date: Wed, 04 Oct 2017 21:07:38 +0200
>>
>> Managed interrupts can end up in a stale state on CPU hotplug. If the
>> interrupt is not targeting a single CPU, i.e. the affinity mask spawns
>> multiple CPUs then the following can happen:
>>
>> After boot:
>>
>> dstate: 0x01601200
>> IRQD_ACTIVATED
>> IRQD_IRQ_STARTED
>> IRQD_SINGLE_TARGET
>> IRQD_AFFINITY_SET
>> IRQD_AFFINITY_MANAGED
>> node: 0
>> affinity: 24-31
>> effectiv: 24
>> pending: 0
>>
>> After offlining CPU 31 - 24
>>
>> dstate: 0x01a31000
>> IRQD_IRQ_DISABLED
>> IRQD_IRQ_MASKED
>> IRQD_SINGLE_TARGET
>> IRQD_AFFINITY_SET
>> IRQD_AFFINITY_MANAGED
>> IRQD_MANAGED_SHUTDOWN
>> node: 0
>> affinity: 24-31
>> effectiv: 24
>> pending: 0
>>
>> Now CPU 25 gets onlined again, so it should get the effective interrupt
>> affinity for this interruopt, but due to the x86 interrupt affinity setter
>> restrictions this ends up after restarting the interrupt with:
>>
>> dstate: 0x01601300
>> IRQD_ACTIVATED
>> IRQD_IRQ_STARTED
>> IRQD_SINGLE_TARGET
>> IRQD_AFFINITY_SET
>> IRQD_SETAFFINITY_PENDING
>> IRQD_AFFINITY_MANAGED
>> node: 0
>> affinity: 24-31
>> effectiv: 24
>> pending: 24-31
>>
>> So the interrupt is still affine to CPU 24, which was the last CPU to go
>> offline of that affinity set and the move to an online CPU within 24-31,
>> in this case 25, is pending. This mechanism is x86/ia64 specific as those
>> architectures cannot move interrupts from thread context and do this when
>> an interrupt is actually handled. So the move is set to pending.
>>
>> Whats worse is that offlining CPU 25 again results in:
>>
>> dstate: 0x01601300
>> IRQD_ACTIVATED
>> IRQD_IRQ_STARTED
>> IRQD_SINGLE_TARGET
>> IRQD_AFFINITY_SET
>> IRQD_SETAFFINITY_PENDING
>> IRQD_AFFINITY_MANAGED
>> node: 0
>> affinity: 24-31
>> effectiv: 24
>> pending: 24-31
>>
>> This means the interrupt has not been shut down, because the outgoing CPU
>> is not in the effective affinity mask, but of course nothing notices that
>> the effective affinity mask is pointing at an offline CPU.
>>
>> In the case of restarting a managed interrupt the move restriction does not
>> apply, so the affinity setting can be made unconditional. This needs to be
>> done _before_ the interrupt is started up as otherwise the condition for
>> moving it from thread context would not longer be fulfilled.
>>
>> With that change applied onlining CPU 25 after offlining 31-24 results in:
>>
>> dstate: 0x01600200
>> IRQD_ACTIVATED
>> IRQD_IRQ_STARTED
>> IRQD_SINGLE_TARGET
>> IRQD_AFFINITY_MANAGED
>> node: 0
>> affinity: 24-31
>> effectiv: 25
>> pending:
>>
>> And after offlining CPU 25:
>>
>> dstate: 0x01a30000
>> IRQD_IRQ_DISABLED
>> IRQD_IRQ_MASKED
>> IRQD_SINGLE_TARGET
>> IRQD_AFFINITY_MANAGED
>> IRQD_MANAGED_SHUTDOWN
>> node: 0
>> affinity: 24-31
>> effectiv: 25
>> pending:
>>
>> which is the correct and expected result.
>>
>> To complete that, add some debug code to catch this kind of situation in
>> the cpu offline code and warn about interrupt chips which allow affinity
>> setting and do not update the effective affinity mask if that feature is
>> enabled.
>>
>> Reported-by: YASUAKI ISHIMATSU <yasu.isimatu@xxxxxxxxx>
>> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>>
>> ---
>> kernel/irq/chip.c | 2 +-
>> kernel/irq/cpuhotplug.c | 28 +++++++++++++++++++++++++++-
>> kernel/irq/manage.c | 17 +++++++++++++++++
>> 3 files changed, 45 insertions(+), 2 deletions(-)
>>
>> --- a/kernel/irq/chip.c
>> +++ b/kernel/irq/chip.c
>> @@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, b
>> irq_setup_affinity(desc);
>> break;
>> case IRQ_STARTUP_MANAGED:
>> + irq_do_set_affinity(d, aff, false);
>> ret = __irq_startup(desc);
>> - irq_set_affinity_locked(d, aff, false);
>> break;
>> case IRQ_STARTUP_ABORT:
>> return 0;
>> --- a/kernel/irq/cpuhotplug.c
>> +++ b/kernel/irq/cpuhotplug.c
>> @@ -18,8 +18,34 @@
>> static inline bool irq_needs_fixup(struct irq_data *d)
>> {
>> const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
>> + unsigned int cpu = smp_processor_id();
>>
>> - return cpumask_test_cpu(smp_processor_id(), m);
>> +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
>> + /*
>> + * The cpumask_empty() check is a workaround for interrupt chips,
>> + * which do not implement effective affinity, but the architecture has
>> + * enabled the config switch. Use the general affinity mask instead.
>> + */
>> + if (cpumask_empty(m))
>> + m = irq_data_get_affinity_mask(d);
>> +
>> + /*
>> + * Sanity check. If the mask is not empty when excluding the outgoing
>> + * CPU then it must contain at least one online CPU. The outgoing CPU
>> + * has been removed from the online mask already.
>> + */
>> + if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
>> + cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
>> + /*
>> + * If this happens then there was a missed IRQ fixup at some
>> + * point. Warn about it and enforce fixup.
>> + */
>> + pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
>> + cpumask_pr_args(m), d->irq, cpu);
>> + return true;
>> + }
>> +#endif
>> + return cpumask_test_cpu(cpu, m);
>> }
>>
>> static bool migrate_one_irq(struct irq_desc *desc)
>> --- a/kernel/irq/manage.c
>> +++ b/kernel/irq/manage.c
>> @@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_
>> set_bit(IRQTF_AFFINITY, &action->thread_flags);
>> }
>>
>> +static void irq_validate_effective_affinity(struct irq_data *data)
>> +{
>> +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
>> + const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
>> + struct irq_chip *chip = irq_data_get_irq_chip(data);
>> +
>> + if (!cpumask_empty(m))
>> + return;
>> + pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
>> + chip->name, data->irq);
>> +#endif
>> +}
>> +
>> int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
>> bool force)
>> {
>> @@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data
>> struct irq_chip *chip = irq_data_get_irq_chip(data);
>> int ret;
>>
>> + if (!chip || !chip->irq_set_affinity)
>> + return -EINVAL;
>> +
>> ret = chip->irq_set_affinity(data, mask, force);
>> switch (ret) {
>> case IRQ_SET_MASK_OK:
>> case IRQ_SET_MASK_OK_DONE:
>> cpumask_copy(desc->irq_common_data.affinity, mask);
>> case IRQ_SET_MASK_OK_NOCOPY:
>> + irq_validate_effective_affinity(data);
>> irq_set_thread_affinity(desc);
>> ret = 0;
>> }
>>