Re: INFO: task hung in blk_queue_enter

From: Jens Axboe
Date: Fri Jun 01 2018 - 20:57:20 EST


On 6/1/18 6:49 PM, Jens Axboe wrote:
> On 6/1/18 5:49 PM, Ming Lei wrote:
>> On Fri, Jun 01, 2018 at 11:52:45AM -0600, Jens Axboe wrote:
>>> On 6/1/18 4:10 AM, Tetsuo Handa wrote:
>>>> Tetsuo Handa wrote:
>>>>> Since sum of percpu_count did not change after percpu_ref_kill(), this is
>>>>> not a race condition while folding percpu counter values into atomic counter
>>>>> value. That is, for some reason, someone who is responsible for calling
>>>>> percpu_ref_put(&q->q_usage_counter) (presumably via blk_queue_exit()) is
>>>>> unable to call percpu_ref_put().
>>>>> But I don't know how to find someone who is failing to call percpu_ref_put()...
>>>>
>>>> I found the someone. It was already there in the backtrace...
>>>>
>>>> ----------------------------------------
>>>> [ 62.065852] a.out D 0 4414 4337 0x00000000
>>>> [ 62.067677] Call Trace:
>>>> [ 62.068545] __schedule+0x40b/0x860
>>>> [ 62.069726] schedule+0x31/0x80
>>>> [ 62.070796] schedule_timeout+0x1c1/0x3c0
>>>> [ 62.072159] ? __next_timer_interrupt+0xd0/0xd0
>>>> [ 62.073670] blk_queue_enter+0x218/0x520
>>>> [ 62.074985] ? remove_wait_queue+0x70/0x70
>>>> [ 62.076361] generic_make_request+0x3d/0x540
>>>> [ 62.077785] ? __bio_clone_fast+0x6b/0x80
>>>> [ 62.079147] ? bio_clone_fast+0x2c/0x70
>>>> [ 62.080456] blk_queue_split+0x29b/0x560
>>>> [ 62.081772] ? blk_queue_split+0x29b/0x560
>>>> [ 62.083162] blk_mq_make_request+0x7c/0x430
>>>> [ 62.084562] generic_make_request+0x276/0x540
>>>> [ 62.086034] submit_bio+0x6e/0x140
>>>> [ 62.087185] ? submit_bio+0x6e/0x140
>>>> [ 62.088384] ? guard_bio_eod+0x9d/0x1d0
>>>> [ 62.089681] do_mpage_readpage+0x328/0x730
>>>> [ 62.091045] ? __add_to_page_cache_locked+0x12e/0x1a0
>>>> [ 62.092726] mpage_readpages+0x120/0x190
>>>> [ 62.094034] ? check_disk_change+0x70/0x70
>>>> [ 62.095454] ? check_disk_change+0x70/0x70
>>>> [ 62.096849] ? alloc_pages_current+0x65/0xd0
>>>> [ 62.098277] blkdev_readpages+0x18/0x20
>>>> [ 62.099568] __do_page_cache_readahead+0x298/0x360
>>>> [ 62.101157] ondemand_readahead+0x1f6/0x490
>>>> [ 62.102546] ? ondemand_readahead+0x1f6/0x490
>>>> [ 62.103995] page_cache_sync_readahead+0x29/0x40
>>>> [ 62.105539] generic_file_read_iter+0x7d0/0x9d0
>>>> [ 62.107067] ? futex_wait+0x221/0x240
>>>> [ 62.108303] ? trace_hardirqs_on+0xd/0x10
>>>> [ 62.109654] blkdev_read_iter+0x30/0x40
>>>> [ 62.110954] generic_file_splice_read+0xc5/0x140
>>>> [ 62.112538] do_splice_to+0x74/0x90
>>>> [ 62.113726] splice_direct_to_actor+0xa4/0x1f0
>>>> [ 62.115209] ? generic_pipe_buf_nosteal+0x10/0x10
>>>> [ 62.116773] do_splice_direct+0x8a/0xb0
>>>> [ 62.118056] do_sendfile+0x1aa/0x390
>>>> [ 62.119255] __x64_sys_sendfile64+0x4e/0xc0
>>>> [ 62.120666] do_syscall_64+0x6e/0x210
>>>> [ 62.121909] entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>>> ----------------------------------------
>>>>
>>>> The someone is blk_queue_split() from blk_mq_make_request() who depends on an
>>>> assumption that blk_queue_enter() from recursively called generic_make_request()
>>>> does not get blocked due to percpu_ref_tryget_live(&q->q_usage_counter) failure.
>>>>
>>>> ----------------------------------------
>>>> generic_make_request(struct bio *bio) {
>>>> if (blk_queue_enter(q, flags) < 0) { /* <= percpu_ref_tryget_live() succeeds. */
>>>> if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
>>>> bio_wouldblock_error(bio);
>>>> else
>>>> bio_io_error(bio);
>>>> return ret;
>>>> }
>>>> (...snipped...)
>>>> ret = q->make_request_fn(q, bio);
>>>> (...snipped...)
>>>> if (q)
>>>> blk_queue_exit(q);
>>>> }
>>>> ----------------------------------------
>>>>
>>>> where q->make_request_fn == blk_mq_make_request which does
>>>>
>>>> ----------------------------------------
>>>> blk_mq_make_request(struct request_queue *q, struct bio *bio) {
>>>> blk_queue_split(q, &bio);
>>>> }
>>>>
>>>> blk_queue_split(struct request_queue *q, struct bio **bio) {
>>>> generic_make_request(*bio); /* <= percpu_ref_tryget_live() fails and waits until atomic_read(&q->mq_freeze_depth) becomes 0. */
>>>> }
>>>> ----------------------------------------
>>>>
>>>> and meanwhile atomic_inc_return(&q->mq_freeze_depth) and
>>>> percpu_ref_kill() are called by blk_freeze_queue_start()...
>>>>
>>>> Now, it is up to you about how to fix this race problem.
>>>
>>> Ahh, nicely spotted! One idea would be the one below. For this case,
>>> we're recursing, so we can either do a non-block queue enter, or we
>>> can just do a live enter.
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 85909b431eb0..7de12e0dcc3d 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -2392,7 +2392,9 @@ blk_qc_t generic_make_request(struct bio *bio)
>>>
>>> if (bio->bi_opf & REQ_NOWAIT)
>>> flags = BLK_MQ_REQ_NOWAIT;
>>> - if (blk_queue_enter(q, flags) < 0) {
>>> + if (bio->bi_opf & REQ_QUEUE_ENTERED)
>>> + blk_queue_enter_live(q);
>>> + else if (blk_queue_enter(q, flags) < 0) {
>>> if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
>>> bio_wouldblock_error(bio);
>>> else
>>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>>> index 782940c65d8a..e1063e8f7eda 100644
>>> --- a/block/blk-merge.c
>>> +++ b/block/blk-merge.c
>>> @@ -210,6 +210,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
>>> /* there isn't chance to merge the splitted bio */
>>> split->bi_opf |= REQ_NOMERGE;
>>>
>>> + /*
>>> + * Since we're recursing into make_request here, ensure
>>> + * that we mark this bio as already having entered the queue.
>>> + * If not, and the queue is going away, we can get stuck
>>> + * forever on waiting for the queue reference to drop. But
>>> + * that will never happen, as we're already holding a
>>> + * reference to it.
>>> + */
>>> + (*bio)->bi_opf |= REQ_QUEUE_ENTERED;
>>> +
>>> bio_chain(split, *bio);
>>> trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
>>> generic_make_request(*bio);
>>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>>> index 17b18b91ebac..a6bc789b83e7 100644
>>> --- a/include/linux/blk_types.h
>>> +++ b/include/linux/blk_types.h
>>> @@ -282,6 +282,8 @@ enum req_flag_bits {
>>> /* command specific flags for REQ_OP_WRITE_ZEROES: */
>>> __REQ_NOUNMAP, /* do not free blocks when zeroing */
>>>
>>> + __REQ_QUEUE_ENTERED, /* queue already entered for this request */
>>> +
>>> /* for driver use */
>>> __REQ_DRV,
>>>
>>> @@ -302,9 +304,8 @@ enum req_flag_bits {
>>> #define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
>>> #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
>>> #define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
>>> -
>>> #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
>>> -
>>> +#define REQ_QUEUE_ENTERED (1ULL << __REQ_QUEUE_ENTERED)
>>> #define REQ_DRV (1ULL << __REQ_DRV)
>>>
>>> #define REQ_FAILFAST_MASK \
>>
>> bio clone inherits .bi_opf, so this way may not work for cloned bio.
>>
>> Given most of bio_split() is run in .make_request_fn(), holding the
>> queue ref may not be needed in case of bio splitting, so another
>> candidate fix might be to introduce one extra parameter of
>> 'need_queue_ref' to generic_make_request().
>
> I did consider that, but that's pretty silly for a corner case and you'd
> have to change a crap ton of calls to it. I think it'd be cleaner to
> clear the bit when we need to, potentially even adding a debug check to
> blk_queue_enter_live() that complains if the ref was not already
> elevated. Though that would be expensive, compared to the percpu inc
> now. Not saying the bit is necessarily the best way forward, but I do
> like it a LOT more than adding an argument to generic_make_request.
>
> It should have been a bio flag to begin with, that better captures the
> scope and also avoids clones inheriting anything they shouldn't. It's
> also not applicable to a request.

A little too much inheritance from the previous patch, better version
below.

diff --git a/block/blk-core.c b/block/blk-core.c
index 85909b431eb0..71cb228d7374 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2392,7 +2392,9 @@ blk_qc_t generic_make_request(struct bio *bio)

if (bio->bi_opf & REQ_NOWAIT)
flags = BLK_MQ_REQ_NOWAIT;
- if (blk_queue_enter(q, flags) < 0) {
+ if (bio_flagged(bio, BIO_QUEUE_ENTERED))
+ blk_queue_enter_live(q);
+ else if (blk_queue_enter(q, flags) < 0) {
if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
bio_wouldblock_error(bio);
else
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 782940c65d8a..481dc02668f9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -210,6 +210,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;

+ /*
+ * Since we're recursing into make_request here, ensure
+ * that we mark this bio as already having entered the queue.
+ * If not, and the queue is going away, we can get stuck
+ * forever on waiting for the queue reference to drop. But
+ * that will never happen, as we're already holding a
+ * reference to it.
+ */
+ bio_set_flag(*bio, BIO_QUEUE_ENTERED);
+
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 17b18b91ebac..1602bf4ab4cd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -186,6 +186,8 @@ struct bio {
* throttling rules. Don't do it again. */
#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion
* of this bio. */
+#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */
+
/* See BVEC_POOL_OFFSET below before adding new flags */

/*

--
Jens Axboe