Re: [ANNOUNCE] v5.14-rc4-rt4

From: Jens Axboe
Date: Wed Aug 04 2021 - 10:24:28 EST


On 8/4/21 7:32 AM, Jens Axboe wrote:
> On 8/4/21 7:17 AM, Peter Zijlstra wrote:
>> On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
>>> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
>>>> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
>>>>> Odd. Do you have a config for that, please?
>>>>
>>>> No need.
>>>> | [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
>>>> | [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
>>>> | [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
>>>> | [ 90.202561] Call Trace:
>>> …
>>>> | [ 90.202588] rt_spin_lock+0x19/0x70
>>>> | [ 90.202593] ___slab_alloc+0xcb/0x7d0
>>> …
>>>> | [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
>>>> | [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
>>>> | [ 90.202625] io_wq_worker_sleeping+0x37/0x50
>>>> | [ 90.202628] schedule+0x30/0xd0
>>>>
>>>> le look.
>>>
>>> So this is due to commit
>>> 685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
>>>
>>> introduced in the v5.13-rc1 merge window. The call chain is
>>> schedule()
>>> sched_submit_work()
>>> preempt_disable();
>>> io_wq_worker_sleeping()
>>> raw_spin_lock_irq(&worker->wqe->lock);
>>> io_wqe_dec_running(worker);
>>> io_queue_worker_create()
>>> kmalloc(sizeof(*cwd), GFP_ATOMIC);
>>>
>>> The lock wqe::lock has been turned into a raw_spinlock_t in commit
>>> 95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
>>>
>>> after a careful analysis of the code at that time. This commit breaks
>>> things. Is this really needed?
>>
>> Urgh, doing allocs from schedule seems really yuck. Can we please not do
>> this?
>
> Agree, I have an idea of how to get rid of it. Let me experiment a bit...

Something like this should do it - the only thing we need the allocation for
is short lived, queueing a task_work item to create a new worker. We can
manage this on a per-existing worker basis, and just have the tw/index
stored in the worker itself. That avoids an allocation off schedule ->
going to sleep.

Totally untested, but I think the principle is sound. I'll run it through
some testing.


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 50dc93ffc153..97eaaf25a429 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {

struct completion ref_done;

+ unsigned long create_state;
+ struct callback_head create_work;
+ int create_index;
+
struct rcu_head rcu;
};

@@ -261,42 +265,44 @@ static void io_wqe_inc_running(struct io_worker *worker)
atomic_inc(&acct->nr_running);
}

-struct create_worker_data {
- struct callback_head work;
- struct io_wqe *wqe;
- int index;
-};
-
static void create_worker_cb(struct callback_head *cb)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;
struct io_wq *wq;

- cwd = container_of(cb, struct create_worker_data, work);
- wq = cwd->wqe->wq;
- create_io_worker(wq, cwd->wqe, cwd->index);
- kfree(cwd);
+ worker = container_of(cb, struct io_worker, create_work);
+ wq = worker->wqe->wq;
+ create_io_worker(wq, worker->wqe, worker->create_index);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}

-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+ struct io_wqe_acct *acct)
{
- struct create_worker_data *cwd;
struct io_wq *wq = wqe->wq;

/* raced with exit, just ignore create call */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
goto fail;
+ /*
+ * create_state manages ownership of create_work/index. We should
+ * only need one entry per worker, as the worker going to sleep
+ * will trigger the condition, and waking will clear it once it
+ * runs the task_work.
+ */
+ if (test_bit(0, &worker->create_state) ||
+ test_and_set_bit_lock(0, &worker->create_state))
+ goto fail;

- cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
- if (cwd) {
- init_task_work(&cwd->work, create_worker_cb);
- cwd->wqe = wqe;
- cwd->index = acct->index;
- if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
- return;
+ io_worker_get(worker);
+ init_task_work(&worker->create_work, create_worker_cb);
+ worker->create_index = acct->index;
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ return;

- kfree(cwd);
- }
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
fail:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
@@ -314,7 +320,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(wqe, acct);
+ io_queue_worker_create(wqe, worker, acct);
}
}

@@ -973,12 +979,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

static bool io_task_work_match(struct callback_head *cb, void *data)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;

if (cb->func != create_worker_cb)
return false;
- cwd = container_of(cb, struct create_worker_data, work);
- return cwd->wqe->wq == data;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker->wqe->wq == data;
}

void io_wq_exit_start(struct io_wq *wq)
@@ -995,12 +1001,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
return;

while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
- struct create_worker_data *cwd;
+ struct io_worker *worker;

- cwd = container_of(cb, struct create_worker_data, work);
- atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+ worker = container_of(cb, struct io_worker, create_work);
+ atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
io_worker_ref_put(wq);
- kfree(cwd);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}

rcu_read_lock();

--
Jens Axboe