Implement complete_all() with swait

From: Daniel Wagner
Date: Fri Oct 28 2016 - 03:17:23 EST


Hi,

I went through all the users of complete_all() in order to figure out if we can change the completion code using swait instead of wait. The motivation for this is to remove another source of non preemptable unbounded work for -rt.

The complete_all() code uses __wake_up_locked(..., 0) which wakes all waiters. Naturally, complete_all() would use swake_up_all() when using the swait API.

One of the main differences is how swake_up_all() is implemented compared to wake API. swake_up_all() toggles with irq enable/disable to introduce additional preemption points.


/*
* Does not allow usage from IRQ disabled, since we must be able to
* release IRQs to guarantee bounded hold time.
*/
void swake_up_all(struct swait_queue_head *q)
{
struct swait_queue *curr;
LIST_HEAD(tmp);

if (!swait_active(q))
return;

raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
curr = list_first_entry(&tmp, typeof(*curr), task_list);

wake_up_state(curr->task, TASK_NORMAL);
list_del_init(&curr->task_list);

if (list_empty(&tmp))
break;

raw_spin_unlock_irq(&q->lock);
raw_spin_lock_irq(&q->lock);
}
raw_spin_unlock_irq(&q->lock);
}

In the end that means we would weaken the garantees complete_all() current gives you. Currently you can use it in any context (mainline). After the swait switch, you can only use it in non IRQ context anymore.


So I went through the list of users and tried to identify which of them is going to make troubles. I found 4 users which are using complete_all() while IRQs are disabled. The rest looks like it just would work nice. I already fixed up a bunch of drivers which use complete_all() just to make really sure the single waiter is woken up. This list does only contain proper complete_all() users.

Just posting my notes so we have something to discuss next week.

cheers,
daniel


* complete_all() under spin_lock


** drivers/block/drbd/drbd_main.c:

The spinlock serializes the access to thi->t_state and complete_all()
is called while irq are disabled.

This seems to be a state machine.

_drbd_thread_stop()
spin_lock_irqsave()
if (thi->t_state == ...) { }
thi->t_state = ...
smp_mb()
init_completion()
thi->t_state = ...
spin_unlock_irqrestore()
wait_for_completion()

drbd_thread_setup()
spin_lock_irqsave(
if (thi->t_state == ...) { }
thi->t_state = ...
smp_mb()
complete_all()
spin_unlock_irqrestore()

992d6e91d365 ("drbd: fix thread stop deadlock")


** drivers/memstick/core/mspro_block.c:

msb->mrq_handler = h_mspro_block_default
memstick_init_req()
memstick_new_req()

jmb38x_ms_isr() # shared irq isr -> thread irq context
spin_lock()
jmb38x_ms_complete_cmd()
memstick_next_req()
host->card->next_request() # h_mspro_block_default()
h_mspro_block_default()
mspro_block_complete_req()
spin_lock_irqsave()
complete_all()
spin_unlock_irqrestore()

r592.c:
kthread(..., "r592_io")
memstick_next_req()
...

rtsx_pci_ms.c:
workqueue
rtsx_pci_ms_handle_req()
memstick_next_req()
...

rtsx_usb_ms.c:
workqueue
rtsc_usb_ms_handle_req()
memstick_next_req()
...

tifm_ms.c:
tifm_ms_req_tasklet
spin_lock_irqsave()
memstick_next_req()
spin_unlock_irqrestore()


f1d82698029b ("memstick: use fully asynchronous request processing")


** drivers/misc/tifm_7xx1.c:

tifm_7xx1_isr() # shared_irq -> thread context
spin_lock()
complete_all()
spin_unlock()

3540af8ffddc ("tifm: replace per-adapter kthread with freezeable workqueue")


** drivers/mmc/core/core.c:

request_threaded_irq(sdhci_irq)

sdhci_irq()
spin_lock()
sdhci_cmd_irq()
sdhci_finish_command()
mmc_command_done()
mmc_complete_cmd()
complete_all()
spin_unlock()

5163af5a5e2e ("mmc: core: Add support for sending commands during data transfer")


** fs/autofs4/expire.c:

autofs4_root_ioctl_unlock()
autofs4_expire_run()
complete_all()

autofs4_root_ioctl_unlock()
autofs4_expire_mutli()
autofs4_do_expire_multi()
spin_lock()
complete_all()
spin_unlock()

6e60a9ab5f5d ("autofs4: fix direct mount pending expire race")


* misc


** drivers/block/amiflop.c:

timer
motor_on_callback()
complete_all()

6d0be946e150 ("m68k: amiflop - Get rid of sleep_on calls")


** drivers/block/cciss.c:

remove_from_scan_list()
mutex_lock()
complete_all()
mutex_unlock()

b368c9dd6598 ("cciss: Use one scan thread per controller and fix hang during rmmod")


** drivers/block/rbd.c:

net/ceph/osd_client.c:
mutex_lock()
complete_request()
__complete_request() -> rbd_od_req_callback
mutext_unlock()


rbd_osd_req_callback()
rbd_obj_request_complete()
complete_all()

788e2df3b92e ("rbd: implement sync object read with new code")


** drivers/gpu/drm/drm_atomic_helper.c:

documantion indicates all happen under wwmutex

https://lwn.net/Articles/653466/

drm_atomic_helper_update_plane()
drm_atomic_commit()
->atomic_commit()

imx-drm-core.c
imx_drm_atomic_commit() # ->atomic_commit
drm_atomic_helper_commit()
drm_atomic_helper_setup_commit()
complete_all()


a095caa7f5ec ("drm/atomic-helper: roll out commit synchronization")


** drivers/gpu/drm/drm_fops.c:

see above

3b24f7d67581 ("drm/atomic: Add struct drm_crtc_commit to track async updates")


** drivers/infiniband/core/umem_odp.c:

ib_umem_notifier_invalidate_page
down_read()
invalidate_page_trampoline()
ib_umem_notifier_end_account()
complete_all()
up_read()


ib_umem_odp_map_dma_pages()
down_read()
ib_umem_odp_map_dma_single_page()
invalidate_page_trampoline()
ib_umem_notifier_end_account()
complete_all()
up_read()

882214e2b128 ("IB/core: Implement support for MMU notifiers regarding on demand paging regions")


** drivers/misc/mic/scif/scif_nodeqp.c:

scif_get_node_info_resp()
mutex_lock()
complete_all()
mutex_unlock()

fdd9fd5c38af ("misc: mic: SCIF messaging and node enumeration APIs")


** drivers/misc/ti-st/st_kim.c:

kim_int_recv()
validate_firmware_response()
complete_all()

workqueue
mgsl_bh_handler()
get_rx_frame()
msgl_get_raw_rx_frame()
ldisc_receive_buf()
ops->receive_buf()

st_ldisc_ops->receive_buf = st_tty_receivce

st_tty_receive()
st_kim_recv()
kim_int_recv()
kim_check_data_len()
validate_firmware_response()
complete_all()

d0088ce183ad ("Staging: sources for Init manager module")


** drivers/rapidio/rio_cm.c:

riocm_cdev_ioctl()
riocm_cdev_close()
riocm_ch_close()
complete_all()

fiocm_cdev_fops->release = riocm_cdev_release

riocm_cdev_release()
riocm_ch_close()
complete_all


b6e8d4aa1110 ("rapidio: add RapidIO channelized messaging driver")


** fs/btrfs/qgroup.c:

btrfs_qgroup_rescan_worker()
mutex_lock()
mutex_unlock()
complete_all()

57254b6ebce4 ("Btrfs: add ioctl to wait for qgroup rescan completion")


** fs/ceph/file.c:

net/ceph_osd_client.c:handle_reply()
mutex_lock()
mutex_unlock()
req->r_unsafe_callback() # ceph_sync_write_unsafe

ceph_sync_write()
req->r_unsafe_callback = ceph_sync_write_unsafe

ceph_sync_write_unsafe()
complete_all()

fe5da05e9798 ("libceph: redo callbacks and factor out MOSDOpReply decoding")


** fs/ceph/mds_client.c:

cleanup_session_request()
mutex_lock()
__unregister_request()
complete_all()
mutex_unlock()

fc55d2c9448b ("ceph: wake up 'safe' waiters when unregistering request")


** fs/gfs2/ops_fstype.c:

gfs2_mount()
mutex_lock()
mutex_unlock()
fill_super()
init_inodes()
complete_all()

0e48e055a7df ("GFS2: Prevent recovery before the local journal is set")


** net/ceph/mon_client.c:

handle_statfs_reply()
mutex_lock()
mutex_unlock()
complete_generic_request()
complete_all()

d0b19705e999 ("libceph: async MON client generic requests")


** net/ceph/osd_client.c:

map_check_cb()
check_pool_dne()
down_write()
complete_request()
__complete_request()
complete_all()
up_write()

ceph_osdc_handle_map()
down_write()
handle_one_map()
scan_request()
check_pool_dne()
__complete_request()
complete_all()
up_write()

handle_reply()
__complete_request
complete_all()

handle_reply()
mutex_lock()
mutex_unlock()
complete_all()

linger_commit_cb()
mutex_lock()
linger_reg_commit_complete()
complete_all
mutex_unlock()

handle_watch_notify()
mutex_lock()
complete_all()
mutex_unlock()

wait_request_timeout()
wait_for_completion_killable_timeout()
complete_alll()

4609245e2670 ("libceph: pool deletion detection")


** sound/pci/hda/hda_intel.c:

snd_device_free()
__snd_device_free()
dev->ops->dev_free() # azx_dev_free()

avz_create()
static struct snd_device_ops ops = {
.dev_disconnect = azx_dev_disconnect,
.dev_free = azx_dev_free,
}

avx_dev_free()
axz_free()
complete_all()

9a34af4a3327 ("ALSA: hda - Move more PCI-controller-specific stuff from generic code")


* pm


** drivers/base/power/main.c:

kernel/kexec_core.c:kernel_kexec()
mutex_trylock()
dpm_resume_start()
dpm_resume_end()
mutex_lock()

drivers/xen/xenbus.c:xenwatch_thread()
drivers/xen/manage.c:shutwdown_watch.callback=shutdown_handler
shutdown_handler()
do_suspend()
dpm_resume_start()

arch/x86/kernel/apm_32.c:apm_mainloop()
apm_event_handler()
check_events()

standby()
dpm_resume_start()

suspend()
dpm_resume_start()

dpm_resume_start()
dmp_resume_noirq()
async_resume_noirq()
device_resume_noirq()
complete_all()

dmp_resume_noirq()
device_resume_noirq()
complete_all()


kernel/reboot.c
syscall(reboot)
hiberante()

kernel/power/autosleep.c:
try_to_suspend() # running from workqueue context
hibernate()

kernel/power/main.c:
state_store()
hiberante()

kernel/power/hibernate.c:
hibernate()
hiernation_snapshot()
create_image
dpm_resume_start()

152e1d592071 ("PM: Prevent waiting forever on asynchronous resume after failing suspend")


*** calling complete_all() twice
device_register()
device_initialize()
device_pm_init()
device_pm_sleep_init()
init_completion()
complete_all()
device_add()
[...]
goto DevAttrError;
[...]
DevAttrError:
device_pm_remove()
complete_all() # second invocation of complete_all()


** drivers/misc/mic/vop/vop_main.c:

Find out in which context the restore callback is called.

pm_generic_restore()
pm->restore()

pm_ops()
return pm->restore

vrtio_pci_pm_ops->restore = virtio_pci_restore

virio_pci_restore()
virtio_device_restore()
dev->config->reset()

vop_vq_config_ops->reset = vop_reset
vop_reset()
complet_all()


c1becd284968 ("misc: mic: Enable VOP card side functionality")


** drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c:

pm->resume = brcmf_ops_sdio_resume

brcmf_ops_sdio_resume()
brcmd_sdiod_freezer_off()
complete_all()

9982464379e8 ("brcmfmac: make sdio suspend wait for threads to freeze")


* crypto


crypto: larval has multiple waiters

module_init()
crypt_register_alg()
crypto_wait_for_test()
crypto_alg_tested()
complete_all()

kthread(..., "cryptomgr_test")
cryptomgr_test()
crypto_alt_test()
complete_all()


kthread(..., "cryptomgr_probe")
cryptomgr_probe()
complete_all()


** crypto/algapi.c:

fe3c5206adc5 ("[CRYPTO] api: Wake up all waiters when larval completes")


** crypto/algboss.c:

939e17799619 ("crypto: algboss - Hold ref count on larval")


** crypto/api.c:

fe3c5206adc5 ("[CRYPTO] api: Wake up all waiters when larval completes")


* firmware loading


** drivers/input/touchscreen/goodix.c:

request_firmware_nowait(goodix_config_cb)

goodix_config_cb()
complete_all()

68caf85881cd ("Input: goodix - write configuration data to device")


** drivers/remoteproc/remoteproc_core.c:

request_firmware_nowait(rproc_fw_config_virtio)

rproc_fw_config_virtio()
complete_all()

400e64df6b23 ("remoteproc: add framework for controlling remote processors")


** drivers/net/wireless/ath/ath9k/hif_usb.c:

request_firmware_nowait(ath9k_hif_usb_firmware_cb)

ath9k_hif_usb_firmware_cb()
ath9k_hif_usb_firmware_fail()
complete_all()

9494849e53e7 ("ath9k_htc: fix data race between request_firmware_nowait() callback and suspend()")


** drivers/net/wireless/ti/wlcore/main.c:

wl12xx_probe()
wlcore_alloc_hw() # will eventually allocated memroy via kmalloc(GFP_KERNEL)
wlcore_probe()
ret = request_firmware_nowait(wlcore_nvs_cb)
if (ret < 0)
complete_all()

wlcore_nvs_cb()
complete_all()

6f8d6b20bb0b ("wlcore: Load the NVS file asynchronously")


* TODO WTF category


** drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:

Seriously?

vchiq_arm_init_state()
init_completion(&arm_state->resume_blocker);
/* Initialise to 'done' state. We only want to block on this
* completion while resume is blocked */
complete_all(&arm_state->resume_blocker);

init_completion(&arm_state->blocked_blocker);
/* Initialise to 'done' state. We only want to block on this
* completion while things are waiting on the resume blocker */
complete_all(&arm_state->blocked_blocker);

set_suspend_state()
switch (new_state) {
case VC_SUSPEND_FORCE_CANCELED:
complete_all(&arm_state->vc_suspend_complete);
break;
case VC_SUSPEND_REJECTED:
complete_all(&arm_state->vc_suspend_complete);
break;
case VC_SUSPEND_FAILED:
complete_all(&arm_state->vc_suspend_complete);
arm_state->vc_resume_state = VC_RESUME_RESUMED;
complete_all(&arm_state->vc_resume_complete);
break;
case VC_SUSPEND_IDLE:
reinit_completion(&arm_state->vc_suspend_complete);
break;
case VC_SUSPEND_REQUESTED:
break;
case VC_SUSPEND_IN_PROGRESS:
set_resume_state(arm_state, VC_RESUME_IDLE);
break;
case VC_SUSPEND_SUSPENDED:
complete_all(&arm_state->vc_suspend_complete);
break;
default:
BUG();
break;
}

71bad7f08641 ("staging: add bcm2708 vchiq driver")


** drivers/target/target_core_transport.c:

Many more call sites to transport_cmd_check_stop_to_fabric(). I got
sad when tracing them down.

target_complet_tmr_failure(struct work_struct)
transport_cmd_check_stop_to_fabric()

transport_cmd_check_stop_to_fabric()
transport_cmd_check_stop()
complete_all()

a95d6511303b ("target: Use complete_all for se_cmd->t_transport_stop_comp")


* single waiter: patches pending


** drivers/base/firmware_class.c:

[PATCH v6 0/6] firmware: encapsulate firmware loading status
http://marc.info/?l=linux-kernel&m=147695715614730&w=3

https://patchwork.kernel.org/patch/9386483/
https://patchwork.kernel.org/patch/9386485/
https://patchwork.kernel.org/patch/9386481/
https://patchwork.kernel.org/patch/9386471/
https://patchwork.kernel.org/patch/9386475/
https://patchwork.kernel.org/patch/9386473/


** drivers/video/fbdev/omap2/omapfb/dss/apply.c

https://patchwork.kernel.org/patch/9347931/


** drivers/usb/gadget/function/f_fs.c.

https://patchwork.kernel.org/patch/9345325/


** drivers/mfd/mc13xxx-core.c

https://patchwork.kernel.org/patch/9345315/