Re: [PATCH v2] bio: limit bio max size

From: Changheun Lee
Date: Thu Jan 21 2021 - 06:41:03 EST


> On Thu, 2021-01-21 at 18:36 +0900, Changheun Lee wrote:
> > > Please drop the "." at the end of the patch title.
> > >
> > > > bio size can grow up to 4GB when muli-page bvec is enabled.
> > > > but sometimes it would lead to inefficient behaviors.
> > > > in case of large chunk direct I/O, - 32MB chunk read in user space -
> > > > all pages for 32MB would be merged to a bio structure if memory address is
> > > > continued phsycally. it makes some delay to submit until merge complete.
> > >
> > > s/if memory address is continued phsycally/if the pages physical addresses are
> > > contiguous/
> > >
> > > > bio max size should be limited as a proper size.
> > >
> > > s/as/to/
> >
> > Thank you for the advice. :)
> >
> > >
> > > >
> > > > When a 32MB chunk read with the direct I/O option comes from userspace,
> > > > the current kernel behavior is shown in the timeline below.
> > > >
> > > > | bio merge for 32MB. total 8,192 pages are merged.
> > > > | total elapsed time is over 2ms.
> > > > |------------------ ... ----------------------->|
> > > >                                                 | 8,192 pages merged a bio.
> > > >                                                 | at this time, first bio submit is done.
> > > >                                                 | 1 bio is split to 32 read request and issue.
> > > >                                                 |--------------->
> > > >                                                  |--------------->
> > > >                                                   |--------------->
> > > >                                                   ......
> > > >                                                              |--------------->
> > > >                                                               |--------------->|
> > > >            total 19ms elapsed to complete 32MB read done from device. |
> > > >
> > > > If the bio max size is limited to 1MB, the behavior changes as below.
> > > >
> > > > | bio merge for 1MB. 256 pages are merged for each bio.
> > > > | total 32 bio will be made.
> > > > | total elapsed time is over 2ms. it's same.
> > > > | but, first bio submit timing is fast. about 100us.
> > > > |--->|--->|--->|---> ... -->|--->|--->|--->|--->|
> > > >      | 256 pages merged a bio.
> > > >      | at this time, first bio submit is done.
> > > >      | and 1 read request is issued for 1 bio.
> > > >      |--------------->
> > > >           |--------------->
> > > >                |--------------->
> > > >                ......
> > > >                                    |--------------->
> > > >                                         |--------------->|
> > > >      total 17ms elapsed to complete 32MB read done from device. |
> > > >
> > > > As a result, read request issue timing is faster if the bio max size is limited.
> > > > With the current kernel behavior with multi-page bvec, a super large bio can be
> > > > created, and it delays issuing the first I/O request.
> > > >
> > > > Signed-off-by: Changheun Lee <nanich.lee@xxxxxxxxxxx>
> > > >
> > > > ---
> > > > block/bio.c | 17 ++++++++++++++++-
> > > > include/linux/bio.h | 13 +++----------
> > > > include/linux/blk_types.h | 1 +
> > > > 3 files changed, 20 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/block/bio.c b/block/bio.c
> > > > index 1f2cc1fbe283..027503c2e2e7 100644
> > > > --- a/block/bio.c
> > > > +++ b/block/bio.c
> > > > @@ -284,9 +284,24 @@ void bio_init(struct bio *bio, struct bio_vec *table,
> > > >
> > > > bio->bi_io_vec = table;
> > > > bio->bi_max_vecs = max_vecs;
> > > > + bio->bi_max_size = UINT_MAX;
> > > > }
> > > > EXPORT_SYMBOL(bio_init);
> > > >
> > > > +void bio_set_dev(struct bio *bio, struct block_device *bdev)
> > > > +{
> > > > + if (bio->bi_disk != bdev->bd_disk)
> > > > + bio_clear_flag(bio, BIO_THROTTLED);
> > > > +
> > > > + bio->bi_disk = bdev->bd_disk;
> > > > + bio->bi_partno = bdev->bd_partno;
> > > > + bio->bi_max_size = blk_queue_get_max_sectors(bio->bi_disk->queue,
> > > > + bio_op(bio)) << SECTOR_SHIFT;
> > > > +
> > > > + bio_associate_blkg(bio);
> > > > +}
> > > > +EXPORT_SYMBOL(bio_set_dev);
> > > > +
> > > > /**
> > > > * bio_reset - reinitialize a bio
> > > > * @bio: bio to reset
> > > > @@ -877,7 +892,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
> > > > struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> > > >
> > > > if (page_is_mergeable(bv, page, len, off, same_page)) {
> > > > - if (bio->bi_iter.bi_size > UINT_MAX - len) {
> > > > + if (bio->bi_iter.bi_size > bio->bi_max_size -
> > > > len)
> > > > *same_page = false;
> > > > return false;
> > > > }
> > > > diff --git a/include/linux/bio.h b/include/linux/bio.h
> > > > index 1edda614f7ce..b9803e80c259 100644
> > > > --- a/include/linux/bio.h
> > > > +++ b/include/linux/bio.h
> > > > @@ -113,7 +113,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
> > > > if (bio->bi_vcnt >= bio->bi_max_vecs)
> > > > return true;
> > > >
> > > > - if (bio->bi_iter.bi_size > UINT_MAX - len)
> > > > + if (bio->bi_iter.bi_size > bio->bi_max_size - len)
> > > > return true;
> > > >
> > > > return false;
> > > > @@ -482,20 +482,13 @@ extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
> > > > extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
> > > > extern unsigned int bvec_nr_vecs(unsigned short idx);
> > > > extern const char *bio_devname(struct bio *bio, char *buffer);
> > > > -
> > > > -#define bio_set_dev(bio, bdev) \
> > > > -do { \
> > > > - if ((bio)->bi_disk != (bdev)->bd_disk) \
> > > > - bio_clear_flag(bio, BIO_THROTTLED);\
> > > > - (bio)->bi_disk = (bdev)->bd_disk; \
> > > > - (bio)->bi_partno = (bdev)->bd_partno; \
> > > > - bio_associate_blkg(bio); \
> > > > -} while (0)
> > > > +extern void bio_set_dev(struct bio *bio, struct block_device *bdev);
> > > >
> > > > #define bio_copy_dev(dst, src) \
> > > > do { \
> > > > (dst)->bi_disk = (src)->bi_disk; \
> > > > (dst)->bi_partno = (src)->bi_partno; \
> > > > + (dst)->bi_max_size = (src)->bi_max_size;\
> > > > bio_clone_blkg_association(dst, src); \
> > > > } while (0)
> > > >
> > > > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > > > index 866f74261b3b..e5dd5b7d8fc1 100644
> > > > --- a/include/linux/blk_types.h
> > > > +++ b/include/linux/blk_types.h
> > > > @@ -270,6 +270,7 @@ struct bio {
> > > > */
> > > >
> > > > unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
> > > > + unsigned int bi_max_size; /* max data size we can hold */
> > > >
> > > > atomic_t __bi_cnt; /* pin count */
> > >
> > > This modification comes at the cost of increasing the bio structure size to
> > > simply tell the block layer "do not delay BIO splitting"...
> > >
> > > I think there is a much simpler approach. What about:
> > >
> > > 1) Use a request queue flag to indicate "limit BIO size"
> > > 2) Modify __bio_try_merge_page() to look at that flag to disallow page merging
> > > if the bio size exceeds blk_queue_get_max_sectors(), or more ideally a version
> > > of it that takes into account the bio start sector.
> > > 3) Set the "limit bio size" queue flag in the driver of the device that benefits
> > > from this change. Eventually, that could also be controlled through sysfs.
> > >
> > > With such a change, you will get the same result without having to increase
> > > the BIO structure size.
> >
> > I have a question.
> > Is adding a new variable to the bio not possible?
>
> It is possible, but since it is a critical kernel resource used a lot, keeping
> it as small as possible for performance reasons is strongly desired. So if
> there is a coding scheme that can avoid increasing struct bio size, it should
> be explored first and discarded only with very good reasons.

I see your point, and I agree with you. It's an important thing. :)

>
> > An additional check on every page merge, like the one below, is inefficient I think.
>
> For the general case of devices that do not care about limiting the bio size
> (like now), this will add one boolean evaluation (queue flag test). That's it.
> For your case, sure you now have 2 boolean evals instead of one. But that must
> be put in perspective with the cost of increasing the bio size.
>
> >
> > bool __bio_try_merge_page(struct bio *bio, struct page *page,
> > unsigned int len, unsigned int off, bool *same_page)
> > {
> > ...
> > if (page_is_mergeable(bv, page, len, off, same_page)) {
> > if (bio->bi_iter.bi_size > UINT_MAX - len) {
> > *same_page = false;
> > return false;
> > }
> >
> > + if (blk_queue_limit_bio_max_size(bio) &&
> > + (bio->bi_iter.bi_size > blk_queue_get_bio_max_size(bio) - len)) {
> > + *same_page = false;
> > + return false;
> > + }
> >
> > bv->bv_len += len;
> > bio->bi_iter.bi_size += len;
> > return true;
> > }
> > ...
> > }
> >
> >
> > static inline bool bio_full(struct bio *bio, unsigned len)
> > {
> > ...
> > if (bio->bi_iter.bi_size > UINT_MAX - len)
> > return true;
> >
> > + if (blk_queue_limit_bio_max_size(bio) &&
> > + (bio->bi_iter.bi_size > blk_queue_get_bio_max_size(bio) - len))
> > + return true;
> > ...
> > }
> >
> >
> > Page merge is a CPU-bound job, as you said.
> > How about the below, with bi_max_size added to the bio?
>
> I am not a fan of adding a bio field for using it only in one place.
> This is only my opinion. I will let others comment about this, but personally
> I would rather do something like this:
>
> #define blk_queue_limit_bio_merge_size(q) \
> test_bit(QUEUE_FLAG_LIMIT_MERGE, &(q)->queue_flags)
>
> static inline unsigned int bio_max_merge_size(struct bio *bio)
> {
> struct request_queue *q = bio->bi_disk->queue;
>
> if (blk_queue_limit_bio_merge_size(q))
> return blk_queue_get_max_sectors(q, bio_op(bio))
> << SECTOR_SHIFT;
> return UINT_MAX;
> }
>
> and use that helper in __bio_try_merge_page(), e.g.:
>
> if (bio->bi_iter.bi_size > bio_max_merge_size(bio) - len) {
> *same_page = false;
> return false;
> }
>
> No need to change the bio struct.
>
> If you measure performance with and without this change on nullblk, you can
> verify if it has any impact for regular devices. And for your use case, that
> should give you the same performance.
>

OK. I'll wait for others' comments for a few days, too.
I'll prepare a v3 patch along the lines you suggest if there is no other feedback. :)
The v2 patch already has a compile error from my mistyping anyway. :(
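
For reference, here is a rough, untested sketch of the direction I have in mind
for v3, following your suggestion. The QUEUE_FLAG_LIMIT_MERGE bit value, the
helper placement, and the driver hook below are only placeholders, not a final
implementation.

/* include/linux/blkdev.h - flag bit value is only a placeholder */
#define QUEUE_FLAG_LIMIT_MERGE	30	/* limit bio merge size to queue max_sectors */

#define blk_queue_limit_bio_merge_size(q) \
	test_bit(QUEUE_FLAG_LIMIT_MERGE, &(q)->queue_flags)

/* include/linux/bio.h */
static inline unsigned int bio_max_merge_size(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	/* only devices that opt in via the queue flag get a smaller limit */
	if (blk_queue_limit_bio_merge_size(q))
		return blk_queue_get_max_sectors(q, bio_op(bio)) << SECTOR_SHIFT;

	return UINT_MAX;
}

static inline bool bio_full(struct bio *bio, unsigned len)
{
	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return true;

	/* stop merging once the per-queue limit (or UINT_MAX) would be exceeded */
	if (bio->bi_iter.bi_size > bio_max_merge_size(bio) - len)
		return true;

	return false;
}

/* __bio_try_merge_page() would use the same check in place of UINT_MAX, and a
 * driver that benefits from early splitting would opt in with:
 *
 *	blk_queue_flag_set(QUEUE_FLAG_LIMIT_MERGE, q);
 */

No new bio field is needed with this, and I'll also measure it on null_blk with
the flag set and cleared, as you suggested, to check that the extra flag test
doesn't affect the general case.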

> >
> > bool __bio_try_merge_page(struct bio *bio, struct page *page,
> > unsigned int len, unsigned int off, bool *same_page)
> > {
> > ...
> > if (page_is_mergeable(bv, page, len, off, same_page)) {
> > - if (bio->bi_iter.bi_size > UINT_MAX - len) {
> > + if (bio->bi_iter.bi_size > bio->bi_max_size - len) {
> > *same_page = false;
> > return false;
> > }
> >
> > bv->bv_len += len;
> > bio->bi_iter.bi_size += len;
> > return true;
> > }
> > ...
> > }
> >
> >
> > static inline bool bio_full(struct bio *bio, unsigned len)
> > {
> > ...
> > - if (bio->bi_iter.bi_size > UINT_MAX - len)
> > + if (bio->bi_iter.bi_size > bio->bi_max_size - len)
> > return true;
> > ...
> > }
> >
> > +void bio_set_dev(struct bio *bio, struct block_device *bdev)
> > +{
> > + if (bio->bi_disk != bdev->bd_disk)
> > + bio_clear_flag(bio, BIO_THROTTLED);
> > +
> > + bio->bi_disk = bdev->bd_disk;
> > + bio->bi_partno = bdev->bd_partno;
> > + if (blk_queue_limit_bio_max_size(bio))
> > + bio->bi_max_size = blk_queue_get_bio_max_size(bio);
> > +
> > + bio_associate_blkg(bio);
> > +}
> > +EXPORT_SYMBOL(bio_set_dev);
> >
> > > --
> > > Damien Le Moal
> > > Western Digital Research
> >
> > ---
> > Changheun Lee
> > Samsung Electronics
>
> --
> Damien Le Moal
> Western Digital
>

---
Changheun Lee
Samsung Electronics