RE: [PATCH RFC] fs/ceph : fix build warning in ceph_writepages_start()
From: Viacheslav Dubeyko
Date: Mon Jul 07 2025 - 14:24:44 EST
On Mon, 2025-07-07 at 13:11 +0300, Alex Markuze wrote:
> Hi Abinash,
>
> Thanks for your patch, you’ve correctly identified a real concern
> around stack usage in ceph_writepages_start().
>
> However, dynamically allocating ceph_writeback_ctl inside .writepages
> isn't ideal. This function may be called in memory reclaim paths or
> under writeback pressure, where any GFP allocation (even GFP_NOFS)
> could deadlock or fail unexpectedly.
>
> Instead of allocating the struct on each call, I’d suggest one of the following:
>
> Add a dedicated kmem_cache for ceph_writeback_ctl, initialized during
> Ceph FS client init.
> This allows reuse across calls without hitting the slab allocator each time.
>
I had discussion with Ilya several days ago related to this issue. :) I
considered namely to add the dedicated kmem_cache for ceph_writeback_ctl.
However, Ilya mentioned that CONFIG_FRAME_WARN defaulting to 2048 on x86_64. So,
you even could not see such warning on x86_64 systems. But, yes, this rework
makes sense from my point of view.
By the way, do we need to consider mempool here instead of kmem_cache? The
writeback is pretty intensive and critical code path.
Thanks,
Slava.
> Embed a ceph_writeback_ctl struct inside ceph_inode_info, if it's
> feasible to manage lifetimes and synchronization. Note that
> .writepages can run concurrently on the same inode, so this approach
> would require proper locking or reuse guarantees.
>
> On Sat, Jul 5, 2025 at 6:54 PM Abinash Singh <abinashlalotra@xxxxxxxxx> wrote:
> >
> > The function `ceph_writepages_start()` triggers
> > a large stack frame warning:
> > ld.lld: warning:
> > fs/ceph/addr.c:1632:0: stack frame size (1072) exceeds limit (1024) in function 'ceph_writepages_start.llvm.2555023190050417194'
> >
> > fix it by dynamically allocating `ceph_writeback_ctl` struct.
> >
> > Signed-off-by: Abinash Singh <abinashsinghlalotra@xxxxxxxxx>
> > ---
> > The high stack usage of ceph_writepages_start() was further
> > confirmed by using -fstack-usage flag :
> > ...
> > fs/ceph/addr.c:1837:ceph_netfs_check_write_begin 104 static
> > fs/ceph/addr.c:1630:ceph_writepages_start 648 static
> > ...
> > After optimzations it may go upto 1072 as seen in warning.
> > After allocating `ceph_writeback_ctl` struct it is reduced to:
> > ...
> > fs/ceph/addr.c:1630:ceph_writepages_start 288 static
> > fs/ceph/addr.c:81:ceph_dirty_folio 72 static
> > ...
> > Is this fun used very frequently ?? or dynamic allocation is
> > a fine fix for reducing the stack usage ?
> >
> > Thank You
> > ---
> > fs/ceph/addr.c | 82 ++++++++++++++++++++++++++------------------------
> > 1 file changed, 43 insertions(+), 39 deletions(-)
> >
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > index 60a621b00c65..503a86c1dda6 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -1633,9 +1633,13 @@ static int ceph_writepages_start(struct address_space *mapping,
> > struct inode *inode = mapping->host;
> > struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
> > struct ceph_client *cl = fsc->client;
> > - struct ceph_writeback_ctl ceph_wbc;
> > + struct ceph_writeback_ctl *ceph_wbc __free(kfree) = NULL;
> > int rc = 0;
> >
> > + ceph_wbc = kmalloc(sizeof(*ceph_wbc), GFP_NOFS);
> > + if (!ceph_wbc)
> > + return -ENOMEM;
> > +
> > if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
> > return 0;
> >
> > @@ -1648,7 +1652,7 @@ static int ceph_writepages_start(struct address_space *mapping,
> > return -EIO;
> > }
> >
> > - ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);
> > + ceph_init_writeback_ctl(mapping, wbc, ceph_wbc);
> >
> > if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
> > rc = -EIO;
> > @@ -1656,7 +1660,7 @@ static int ceph_writepages_start(struct address_space *mapping,
> > }
> >
> > retry:
> > - rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc);
> > + rc = ceph_define_writeback_range(mapping, wbc, ceph_wbc);
> > if (rc == -ENODATA) {
> > /* hmm, why does writepages get called when there
> > is no dirty data? */
> > @@ -1665,55 +1669,55 @@ static int ceph_writepages_start(struct address_space *mapping,
> > }
> >
> > if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> > - tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end);
> > + tag_pages_for_writeback(mapping, ceph_wbc->index, ceph_wbc->end);
> >
> > - while (!has_writeback_done(&ceph_wbc)) {
> > - ceph_wbc.locked_pages = 0;
> > - ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
> > + while (!has_writeback_done(ceph_wbc)) {
> > + ceph_wbc->locked_pages = 0;
> > + ceph_wbc->max_pages = ceph_wbc->wsize >> PAGE_SHIFT;
> >
> > get_more_pages:
> > - ceph_folio_batch_reinit(&ceph_wbc);
> > + ceph_folio_batch_reinit(ceph_wbc);
> >
> > - ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
> > - &ceph_wbc.index,
> > - ceph_wbc.end,
> > - ceph_wbc.tag,
> > - &ceph_wbc.fbatch);
> > + ceph_wbc->nr_folios = filemap_get_folios_tag(mapping,
> > + &ceph_wbc->index,
> > + ceph_wbc->end,
> > + ceph_wbc->tag,
> > + &ceph_wbc->fbatch);
> > doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n",
> > - ceph_wbc.tag, ceph_wbc.nr_folios);
> > + ceph_wbc->tag, ceph_wbc->nr_folios);
> >
> > - if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
> > + if (!ceph_wbc->nr_folios && !ceph_wbc->locked_pages)
> > break;
> >
> > process_folio_batch:
> > - rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
> > + rc = ceph_process_folio_batch(mapping, wbc, ceph_wbc);
> > if (rc)
> > goto release_folios;
> >
> > /* did we get anything? */
> > - if (!ceph_wbc.locked_pages)
> > + if (!ceph_wbc->locked_pages)
> > goto release_folios;
> >
> > - if (ceph_wbc.processed_in_fbatch) {
> > - ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
> > + if (ceph_wbc->processed_in_fbatch) {
> > + ceph_shift_unused_folios_left(&ceph_wbc->fbatch);
> >
> > - if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
> > - ceph_wbc.locked_pages < ceph_wbc.max_pages) {
> > + if (folio_batch_count(&ceph_wbc->fbatch) == 0 &&
> > + ceph_wbc->locked_pages < ceph_wbc->max_pages) {
> > doutc(cl, "reached end fbatch, trying for more\n");
> > goto get_more_pages;
> > }
> > }
> >
> > - rc = ceph_submit_write(mapping, wbc, &ceph_wbc);
> > + rc = ceph_submit_write(mapping, wbc, ceph_wbc);
> > if (rc)
> > goto release_folios;
> >
> > - ceph_wbc.locked_pages = 0;
> > - ceph_wbc.strip_unit_end = 0;
> > + ceph_wbc->locked_pages = 0;
> > + ceph_wbc->strip_unit_end = 0;
> >
> > - if (folio_batch_count(&ceph_wbc.fbatch) > 0) {
> > - ceph_wbc.nr_folios =
> > - folio_batch_count(&ceph_wbc.fbatch);
> > + if (folio_batch_count(&ceph_wbc->fbatch) > 0) {
> > + ceph_wbc->nr_folios =
> > + folio_batch_count(&ceph_wbc->fbatch);
> > goto process_folio_batch;
> > }
> >
> > @@ -1724,38 +1728,38 @@ static int ceph_writepages_start(struct address_space *mapping,
> > * we tagged for writeback prior to entering this loop.
> > */
> > if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
> > - ceph_wbc.done = true;
> > + ceph_wbc->done = true;
> >
> > release_folios:
> > doutc(cl, "folio_batch release on %d folios (%p)\n",
> > - (int)ceph_wbc.fbatch.nr,
> > - ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL);
> > - folio_batch_release(&ceph_wbc.fbatch);
> > + (int)ceph_wbc->fbatch.nr,
> > + ceph_wbc->fbatch.nr ? ceph_wbc->fbatch.folios[0] : NULL);
> > + folio_batch_release(&ceph_wbc->fbatch);
> > }
> >
> > - if (ceph_wbc.should_loop && !ceph_wbc.done) {
> > + if (ceph_wbc->should_loop && !ceph_wbc->done) {
> > /* more to do; loop back to beginning of file */
> > doutc(cl, "looping back to beginning of file\n");
> > /* OK even when start_index == 0 */
> > - ceph_wbc.end = ceph_wbc.start_index - 1;
> > + ceph_wbc->end = ceph_wbc->start_index - 1;
> >
> > /* to write dirty pages associated with next snapc,
> > * we need to wait until current writes complete */
> > - ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc);
> > + ceph_wait_until_current_writes_complete(mapping, wbc, ceph_wbc);
> >
> > - ceph_wbc.start_index = 0;
> > - ceph_wbc.index = 0;
> > + ceph_wbc->start_index = 0;
> > + ceph_wbc->index = 0;
> > goto retry;
> > }
> >
> > - if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0))
> > - mapping->writeback_index = ceph_wbc.index;
> > + if (wbc->range_cyclic || (ceph_wbc->range_whole && wbc->nr_to_write > 0))
> > + mapping->writeback_index = ceph_wbc->index;
> >
> > dec_osd_stopping_blocker:
> > ceph_dec_osd_stopping_blocker(fsc->mdsc);
> >
> > out:
> > - ceph_put_snap_context(ceph_wbc.last_snapc);
> > + ceph_put_snap_context(ceph_wbc->last_snapc);
> > doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
> > rc);
> >
> > --
> > 2.43.0
> >
> >
>