Re: [patch] pipe: add support for shrinking and growing pipes

From: Miklos Szeredi
Date: Thu May 20 2010 - 04:33:40 EST


On Wed, 19 May 2010, Jens Axboe wrote:
> On Wed, May 19 2010, Jens Axboe wrote:
> > On Wed, May 19 2010, Linus Torvalds wrote:
> > >
> > >
> > > On Wed, 19 May 2010, Miklos Szeredi wrote:
> > > >
> > > > One issue I see is that it's possible to grow pipes indefinitely.
> > > > Should this be restricted to privileged users?
> > >
> > > Yes. But perhaps only if it grows past the default (or perhaps "default*2"
> > > or similar). That way a normal user could shrink the pipe buffers, and
> > > then grow them again if he wants to.
> >
> > That's still a bit arbitrary; I don't think allowing default*2 only for
> > non-root is going to be hugely interesting. Limiting makes sense,
> > but let's at least allow a larger max limit for the normal user. I
> > suspect that the media application that wants to use this will not be
> > running as root, and if we don't make the feature properly available to
> > the ones that want to use it, then we may as well not do it.
> >
> > Or we could expose a sysctl for instance that holds the max non-root
> > size. And make that default to default*16 or something. How does that
> > sound?
> >
> > > Oh, and I think you need to also require that there be at least two
> > > buffers. Otherwise we can't guarantee POSIX behavior, I think.
> >
> > Good point, and at least that part is easily doable :-)
>
> So I updated the patch, that branch was pretty ancient... The fcntl pipe
> numbers were also screwed up, so got that fixed. New patch is here:
>
> http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=23dcb845246946aeda5a5e398c6911381ad28365

Not sure why you didn't take my updated patch. Yours is missing the
conversion of kernel/trace/trace.c, as well as some hunks below (which
renaming the "pages" and "partial" arrays and running a compile test
would have revealed).

Thanks,
Miklos

> From: Jens Axboe <jens.axboe@xxxxxxxxxx>
> Date: Wed, 19 May 2010 18:47:30 +0000 (+0200)
> Subject: pipe: add support for shrinking and growing pipes
> X-Git-Url: http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff_plain;h=23dcb845246946aeda5a5e398c6911381ad28365
>
> pipe: add support for shrinking and growing pipes
>
> This patch adds F_GETPIPE_SZ and F_SETPIPE_SZ fcntl() actions for
> growing and shrinking the size of a pipe and adjusts pipe.c and splice.c
> (and relay and network splice) usage to work with these larger (or smaller)
> pipes.
>
> Signed-off-by: Jens Axboe <jens.axboe@xxxxxxxxxx>
> ---
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 452d02f..bcba960 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -14,6 +14,7 @@
> #include <linux/dnotify.h>
> #include <linux/slab.h>
> #include <linux/module.h>
> +#include <linux/pipe_fs_i.h>
> #include <linux/security.h>
> #include <linux/ptrace.h>
> #include <linux/signal.h>
> @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
> case F_NOTIFY:
> err = fcntl_dirnotify(fd, filp, arg);
> break;
> + case F_SETPIPE_SZ:
> + case F_GETPIPE_SZ:
> + err = pipe_fcntl(filp, cmd, arg);
> + break;
> default:
> break;
> }
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 37ba29f..054b8a6 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -11,6 +11,7 @@
> #include <linux/module.h>
> #include <linux/init.h>
> #include <linux/fs.h>
> +#include <linux/log2.h>
> #include <linux/mount.h>
> #include <linux/pipe_fs_i.h>
> #include <linux/uio.h>
> @@ -390,7 +391,7 @@ redo:
> if (!buf->len) {
> buf->ops = NULL;
> ops->release(pipe, buf);
> - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
> + curbuf = (curbuf + 1) & (pipe->buffers - 1);
> pipe->curbuf = curbuf;
> pipe->nrbufs = --bufs;
> do_wakeup = 1;
> @@ -472,7 +473,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
> chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
> if (pipe->nrbufs && chars != 0) {
> int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
> - (PIPE_BUFFERS-1);
> + (pipe->buffers - 1);
> struct pipe_buffer *buf = pipe->bufs + lastbuf;
> const struct pipe_buf_operations *ops = buf->ops;
> int offset = buf->offset + buf->len;
> @@ -518,8 +519,8 @@ redo1:
> break;
> }
> bufs = pipe->nrbufs;
> - if (bufs < PIPE_BUFFERS) {
> - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
> + if (bufs < pipe->buffers) {
> + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
> struct pipe_buffer *buf = pipe->bufs + newbuf;
> struct page *page = pipe->tmp_page;
> char *src;
> @@ -580,7 +581,7 @@ redo2:
> if (!total_len)
> break;
> }
> - if (bufs < PIPE_BUFFERS)
> + if (bufs < pipe->buffers)
> continue;
> if (filp->f_flags & O_NONBLOCK) {
> if (!ret)
> @@ -640,7 +641,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> nrbufs = pipe->nrbufs;
> while (--nrbufs >= 0) {
> count += pipe->bufs[buf].len;
> - buf = (buf+1) & (PIPE_BUFFERS-1);
> + buf = (buf+1) & (pipe->buffers - 1);
> }
> mutex_unlock(&inode->i_mutex);
>
> @@ -671,7 +672,7 @@ pipe_poll(struct file *filp, poll_table *wait)
> }
>
> if (filp->f_mode & FMODE_WRITE) {
> - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
> + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
> /*
> * Most Unices do not set POLLERR for FIFOs but on Linux they
> * behave exactly like pipes for poll().
> @@ -877,25 +878,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
>
> pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
> if (pipe) {
> - init_waitqueue_head(&pipe->wait);
> - pipe->r_counter = pipe->w_counter = 1;
> - pipe->inode = inode;
> + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
> + if (pipe->bufs) {
> + init_waitqueue_head(&pipe->wait);
> + pipe->r_counter = pipe->w_counter = 1;
> + pipe->inode = inode;
> + pipe->buffers = PIPE_DEF_BUFFERS;
> + return pipe;
> + }
> + kfree(pipe);
> }
>
> - return pipe;
> + return NULL;
> }
>
> void __free_pipe_info(struct pipe_inode_info *pipe)
> {
> int i;
>
> - for (i = 0; i < PIPE_BUFFERS; i++) {
> + for (i = 0; i < pipe->buffers; i++) {
> struct pipe_buffer *buf = pipe->bufs + i;
> if (buf->ops)
> buf->ops->release(pipe, buf);
> }
> if (pipe->tmp_page)
> __free_page(pipe->tmp_page);
> + kfree(pipe->bufs);
> kfree(pipe);
> }
>
> @@ -1094,6 +1102,81 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
> }
>
> /*
> + * Allocate a new array of pipe buffers and copy the info over. Returns the
> + * pipe size if successful, or return -ERROR on error.
> + */
> +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
> +{
> + struct pipe_buffer *bufs;
> +
> + /*
> + * Must be a power-of-2 currently
> + */
> + if (!is_power_of_2(arg))
> + return -EINVAL;
> +
> + /*
> + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
> + * expect a lot of shrink+grow operations, just free and allocate
> + * again like we would do for growing. If the pipe currently
> + * contains more buffers than arg, then return busy.
> + */
> + if (arg < pipe->nrbufs)
> + return -EBUSY;
> +
> + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
> + if (unlikely(!bufs))
> + return -ENOMEM;
> +
> + /*
> + * The pipe array wraps around, so just start the new one at zero
> + * and adjust the indexes.
> + */
> + if (pipe->nrbufs) {
> + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
> + const unsigned int head = pipe->nrbufs - tail;
> +
> + if (head)
> + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
> + if (tail)
> + memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
> + }
> +
> + pipe->curbuf = 0;
> + kfree(pipe->bufs);
> + pipe->bufs = bufs;
> + pipe->buffers = arg;
> + return arg;
> +}
> +
> +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> + struct pipe_inode_info *pipe;
> + long ret;
> +
> + pipe = file->f_path.dentry->d_inode->i_pipe;
> + if (!pipe)
> + return -EBADF;
> +
> + mutex_lock(&pipe->inode->i_mutex);
> +
> + switch (cmd) {
> + case F_SETPIPE_SZ:
> + ret = pipe_set_size(pipe, arg);
> + break;
> + case F_GETPIPE_SZ:
> + ret = pipe->buffers;
> + break;
> + default:
> + ret = -EINVAL;
> + break;
> + }
> +
> + mutex_unlock(&pipe->inode->i_mutex);
> + return ret;
> +}
> +
> +/*
> * pipefs should _never_ be mounted by userland - too much of security hassle,
> * no real gain from having the whole whorehouse mounted. So we don't need
> * any operations on the root directory. However, we need a non-trivial
> diff --git a/fs/splice.c b/fs/splice.c
> index 9313b61..39f907d 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> break;
> }
>
> - if (pipe->nrbufs < PIPE_BUFFERS) {
> - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
> + if (pipe->nrbufs < pipe->buffers) {
> + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
> struct pipe_buffer *buf = pipe->bufs + newbuf;
>
> buf->page = spd->pages[page_nr];
> @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>
> if (!--spd->nr_pages)
> break;
> - if (pipe->nrbufs < PIPE_BUFFERS)
> + if (pipe->nrbufs < pipe->buffers)
> continue;
>
> break;
> @@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
> page_cache_release(spd->pages[i]);
> }
>
> +/*
> + * Check if we need to grow the arrays holding pages and partial page
> + * descriptions.
> + */
> +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
> +{
> + if (pipe->buffers <= PIPE_DEF_BUFFERS)
> + return 0;
> +
> + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
> + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
> +
> + if (spd->pages && spd->partial)
> + return 0;
> +
> + kfree(spd->pages);
> + kfree(spd->partial);
> + return -ENOMEM;
> +}
> +
> +void splice_shrink_spd(struct pipe_inode_info *pipe,
> + struct splice_pipe_desc *spd)
> +{
> + if (pipe->buffers <= PIPE_DEF_BUFFERS)
> + return;
> +
> + kfree(spd->pages);
> + kfree(spd->partial);
> +}
> +
> static int
> __generic_file_splice_read(struct file *in, loff_t *ppos,
> struct pipe_inode_info *pipe, size_t len,
> @@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
> {
> struct address_space *mapping = in->f_mapping;
> unsigned int loff, nr_pages, req_pages;
> - struct page *pages[PIPE_BUFFERS];
> - struct partial_page partial[PIPE_BUFFERS];
> + struct page *pages[PIPE_DEF_BUFFERS];
> + struct partial_page partial[PIPE_DEF_BUFFERS];
> struct page *page;
> pgoff_t index, end_index;
> loff_t isize;
> @@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
> .spd_release = spd_release_page,
> };
>
> + if (splice_grow_spd(pipe, &spd))
> + return -ENOMEM;
> +
> index = *ppos >> PAGE_CACHE_SHIFT;
> loff = *ppos & ~PAGE_CACHE_MASK;
> req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
> + nr_pages = min(req_pages, pipe->buffers);
>
> /*
> * Lookup the (hopefully) full range of pages we need.
> */
> - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
> + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
> index += spd.nr_pages;
>
> /*
> @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
> unlock_page(page);
> }
>
> - pages[spd.nr_pages++] = page;
> + spd.pages[spd.nr_pages++] = page;
> index++;
> }
>
> @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
> * this_len is the max we'll use from this page
> */
> this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
> - page = pages[page_nr];
> + page = spd.pages[page_nr];
>
> if (PageReadahead(page))
> page_cache_async_readahead(mapping, &in->f_ra, in,

A hunk is missing here:

@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *
error = -ENOMEM;
break;
}
- page_cache_release(pages[page_nr]);
- pages[page_nr] = page;
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
}
/*
* page was already under io and is now done, great



> @@ -451,8 +484,8 @@ fill_it:
> len = this_len;
> }
>
> - partial[page_nr].offset = loff;
> - partial[page_nr].len = this_len;
> + spd.partial[page_nr].offset = loff;
> + spd.partial[page_nr].len = this_len;
> len -= this_len;
> loff = 0;
> spd.nr_pages++;
> @@ -464,12 +497,13 @@ fill_it:
> * we got, 'nr_pages' is how many pages are in the map.
> */
> while (page_nr < nr_pages)
> - page_cache_release(pages[page_nr++]);
> + page_cache_release(spd.pages[page_nr++]);
> in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
>
> if (spd.nr_pages)
> - return splice_to_pipe(pipe, &spd);
> + error = splice_to_pipe(pipe, &spd);
>
> + splice_shrink_spd(pipe, &spd);
> return error;
> }
>
> @@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> unsigned int nr_pages;
> unsigned int nr_freed;
> size_t offset;
> - struct page *pages[PIPE_BUFFERS];
> - struct partial_page partial[PIPE_BUFFERS];
> - struct iovec vec[PIPE_BUFFERS];
> + struct page *pages[PIPE_DEF_BUFFERS];
> + struct partial_page partial[PIPE_DEF_BUFFERS];
> + struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
> pgoff_t index;
> ssize_t res;
> size_t this_len;
> @@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> .spd_release = spd_release_page,
> };
>
> + if (splice_grow_spd(pipe, &spd))
> + return -ENOMEM;
> +
> + res = -ENOMEM;
> + vec = __vec;
> + if (pipe->buffers > PIPE_DEF_BUFFERS) {
> + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
> + if (!vec)
> + goto shrink_ret;
> + }
> +
> index = *ppos >> PAGE_CACHE_SHIFT;
> offset = *ppos & ~PAGE_CACHE_MASK;
> nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>
> - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
> + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
> struct page *page;
>
> page = alloc_page(GFP_USER);

Two more hunks are missing here:

@@ -591,7 +634,7 @@ ssize_t default_file_splice_read(struct
this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
vec[i].iov_base = (void __user *) page_address(page);
vec[i].iov_len = this_len;
- pages[i] = page;
+ spd.pages[i] = page;
spd.nr_pages++;
len -= this_len;
offset = 0;
@@ -610,11 +653,11 @@ ssize_t default_file_splice_read(struct
nr_freed = 0;
for (i = 0; i < spd.nr_pages; i++) {
this_len = min_t(size_t, vec[i].iov_len, res);
- partial[i].offset = 0;
- partial[i].len = this_len;
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = this_len;
if (!this_len) {
- __free_page(pages[i]);
- pages[i] = NULL;
+ __free_page(spd.pages[i]);
+ spd.pages[i] = NULL;
nr_freed++;
}
res -= this_len;



> @@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> if (res > 0)
> *ppos += res;
>
> +shrink_ret:
> + if (vec != __vec)
> + kfree(vec);
> + splice_shrink_spd(pipe, &spd);
> return res;
>
> err:
> for (i = 0; i < spd.nr_pages; i++)
> __free_page(pages[i]);
>
> - return error;
> + res = error;
> + goto shrink_ret;
> }
> EXPORT_SYMBOL(default_file_splice_read);
>
> @@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
> if (!buf->len) {
> buf->ops = NULL;
> ops->release(pipe, buf);
> - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
> + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
> pipe->nrbufs--;
> if (pipe->inode)
> sd->need_wakeup = true;
> @@ -1211,7 +1261,7 @@ out_release:
> * If we did an incomplete transfer we must release
> * the pipe buffers in question:
> */
> - for (i = 0; i < PIPE_BUFFERS; i++) {
> + for (i = 0; i < pipe->buffers; i++) {
> struct pipe_buffer *buf = pipe->bufs + i;
>
> if (buf->ops) {
> @@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
> */
> static int get_iovec_page_array(const struct iovec __user *iov,
> unsigned int nr_vecs, struct page **pages,
> - struct partial_page *partial, int aligned)
> + struct partial_page *partial, int aligned,
> + unsigned int pipe_buffers)
> {
> int buffers = 0, error = 0;
>
> @@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
> break;
>
> npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
> - if (npages > PIPE_BUFFERS - buffers)
> - npages = PIPE_BUFFERS - buffers;
> + if (npages > pipe_buffers - buffers)
> + npages = pipe_buffers - buffers;
>
> error = get_user_pages_fast((unsigned long)base, npages,
> 0, &pages[buffers]);
> @@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
> * or if we mapped the max number of pages that we have
> * room for.
> */
> - if (error < npages || buffers == PIPE_BUFFERS)
> + if (error < npages || buffers == pipe_buffers)
> break;
>
> nr_vecs--;
> @@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
> unsigned long nr_segs, unsigned int flags)
> {
> struct pipe_inode_info *pipe;
> - struct page *pages[PIPE_BUFFERS];
> - struct partial_page partial[PIPE_BUFFERS];
> + struct page *pages[PIPE_DEF_BUFFERS];
> + struct partial_page partial[PIPE_DEF_BUFFERS];
> struct splice_pipe_desc spd = {
> .pages = pages,
> .partial = partial,
> @@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
> .ops = &user_page_pipe_buf_ops,
> .spd_release = spd_release_page,
> };
> + long ret;
>
> pipe = pipe_info(file->f_path.dentry->d_inode);
> if (!pipe)
> return -EBADF;
>
> - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
> - flags & SPLICE_F_GIFT);
> + if (splice_grow_spd(pipe, &spd))
> + return -ENOMEM;
> +
> + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
> + spd.partial, flags & SPLICE_F_GIFT,
> + pipe->buffers);
> if (spd.nr_pages <= 0)
> - return spd.nr_pages;
> + ret = spd.nr_pages;
> + else
> + ret = splice_to_pipe(pipe, &spd);
>
> - return splice_to_pipe(pipe, &spd);
> + splice_shrink_spd(pipe, &spd);
> + return ret;
> }
>
> /*
> @@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
> * Check ->nrbufs without the inode lock first. This function
> * is speculative anyways, so missing one is ok.
> */
> - if (pipe->nrbufs < PIPE_BUFFERS)
> + if (pipe->nrbufs < pipe->buffers)
> return 0;
>
> ret = 0;
> pipe_lock(pipe);
>
> - while (pipe->nrbufs >= PIPE_BUFFERS) {
> + while (pipe->nrbufs >= pipe->buffers) {
> if (!pipe->readers) {
> send_sig(SIGPIPE, current, 0);
> ret = -EPIPE;
> @@ -1810,7 +1869,7 @@ retry:
> * Cannot make any progress, because either the input
> * pipe is empty or the output pipe is full.
> */
> - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
> + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
> /* Already processed some buffers, break */
> if (ret)
> break;
> @@ -1831,7 +1890,7 @@ retry:
> }
>
> ibuf = ipipe->bufs + ipipe->curbuf;
> - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
> + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
> obuf = opipe->bufs + nbuf;
>
> if (len >= ibuf->len) {
> @@ -1841,7 +1900,7 @@ retry:
> *obuf = *ibuf;
> ibuf->ops = NULL;
> opipe->nrbufs++;
> - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
> + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
> ipipe->nrbufs--;
> input_wakeup = true;
> } else {
> @@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
> * If we have iterated all input buffers or ran out of
> * output room, break.
> */
> - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
> + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
> break;
>
> - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
> - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
> + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
> + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
>
> /*
> * Get a reference to this pipe buffer,
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index 8603740..afc00af 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -22,6 +22,12 @@
> #define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2)
>
> /*
> + * Set and get of pipe page size array
> + */
> +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
> +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
> +
> +/*
> * Types of directory notifications that may be requested.
> */
> #define DN_ACCESS 0x00000001 /* File accessed */
> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
> index b43a9e0..65f4282 100644
> --- a/include/linux/pipe_fs_i.h
> +++ b/include/linux/pipe_fs_i.h
> @@ -3,7 +3,7 @@
>
> #define PIPEFS_MAGIC 0x50495045
>
> -#define PIPE_BUFFERS (16)
> +#define PIPE_DEF_BUFFERS 16
>
> #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
> #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
> @@ -44,17 +44,17 @@ struct pipe_buffer {
> **/
> struct pipe_inode_info {
> wait_queue_head_t wait;
> - unsigned int nrbufs, curbuf;
> - struct page *tmp_page;
> + unsigned int nrbufs, curbuf, buffers;
> unsigned int readers;
> unsigned int writers;
> unsigned int waiting_writers;
> unsigned int r_counter;
> unsigned int w_counter;
> + struct page *tmp_page;
> struct fasync_struct *fasync_readers;
> struct fasync_struct *fasync_writers;
> struct inode *inode;
> - struct pipe_buffer bufs[PIPE_BUFFERS];
> + struct pipe_buffer *bufs;
> };
>
> /*
> @@ -154,4 +154,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
> int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
> void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
>
> +/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
> +long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
> +
> #endif
> diff --git a/include/linux/splice.h b/include/linux/splice.h
> index 18e7c7c..997c3b4 100644
> --- a/include/linux/splice.h
> +++ b/include/linux/splice.h
> @@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *,
> extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
> splice_direct_actor *);
>
> +/*
> + * for dynamic pipe sizing
> + */
> +extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *);
> +extern void splice_shrink_spd(struct pipe_inode_info *,
> + struct splice_pipe_desc *);
> +
> #endif
> diff --git a/kernel/relay.c b/kernel/relay.c
> index 3d97f28..4268287 100644
> --- a/kernel/relay.c
> +++ b/kernel/relay.c
> @@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
> size_t read_subbuf = read_start / subbuf_size;
> size_t padding = rbuf->padding[read_subbuf];
> size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
> - struct page *pages[PIPE_BUFFERS];
> - struct partial_page partial[PIPE_BUFFERS];
> + struct page *pages[PIPE_DEF_BUFFERS];
> + struct partial_page partial[PIPE_DEF_BUFFERS];
> struct splice_pipe_desc spd = {
> .pages = pages,
> .nr_pages = 0,
> @@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
>
> if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
> return 0;
> + if (splice_grow_spd(pipe, &spd))
> + return -ENOMEM;
>
> /*
> * Adjust read len, if longer than what is available
> @@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
> subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
> pidx = (read_start / PAGE_SIZE) % subbuf_pages;
> poff = read_start & ~PAGE_MASK;
> - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
> + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
>
> for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
> unsigned int this_len, this_end, private;
> @@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
> }
> }
>
> + ret = 0;
> if (!spd.nr_pages)
> - return 0;
> + goto out;
>
> ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
> if (ret < 0 || ret < total_len)
> - return ret;
> + goto out;
>
> if (read_start + ret == nonpad_end)
> ret += padding;
>
> +out:
> + splice_shrink_spd(pipe, &spd);
> return ret;
> }
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 93c4e06..9319817 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1417,12 +1417,13 @@ new_page:
> /*
> * Fill page/offset/length into spd, if it can hold more pages.
> */
> -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
> +static inline int spd_fill_page(struct splice_pipe_desc *spd,
> + struct pipe_inode_info *pipe, struct page *page,
> unsigned int *len, unsigned int offset,
> struct sk_buff *skb, int linear,
> struct sock *sk)
> {
> - if (unlikely(spd->nr_pages == PIPE_BUFFERS))
> + if (unlikely(spd->nr_pages == pipe->buffers))
> return 1;
>
> if (linear) {
> @@ -1458,7 +1459,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
> unsigned int plen, unsigned int *off,
> unsigned int *len, struct sk_buff *skb,
> struct splice_pipe_desc *spd, int linear,
> - struct sock *sk)
> + struct sock *sk,
> + struct pipe_inode_info *pipe)
> {
> if (!*len)
> return 1;
> @@ -1481,7 +1483,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
> /* the linear region may spread across several pages */
> flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
>
> - if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk))
> + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
> return 1;
>
> __segment_seek(&page, &poff, &plen, flen);
> @@ -1496,9 +1498,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
> * Map linear and fragment data from the skb to spd. It reports failure if the
> * pipe is full or if we already spliced the requested length.
> */
> -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
> - unsigned int *len, struct splice_pipe_desc *spd,
> - struct sock *sk)
> +static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
> + unsigned int *offset, unsigned int *len,
> + struct splice_pipe_desc *spd, struct sock *sk)
> {
> int seg;
>
> @@ -1508,7 +1510,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
> if (__splice_segment(virt_to_page(skb->data),
> (unsigned long) skb->data & (PAGE_SIZE - 1),
> skb_headlen(skb),
> - offset, len, skb, spd, 1, sk))
> + offset, len, skb, spd, 1, sk, pipe))
> return 1;
>
> /*
> @@ -1518,7 +1520,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
> const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
>
> if (__splice_segment(f->page, f->page_offset, f->size,
> - offset, len, skb, spd, 0, sk))
> + offset, len, skb, spd, 0, sk, pipe))
> return 1;
> }
>
> @@ -1535,8 +1537,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
> struct pipe_inode_info *pipe, unsigned int tlen,
> unsigned int flags)
> {
> - struct partial_page partial[PIPE_BUFFERS];
> - struct page *pages[PIPE_BUFFERS];
> + struct partial_page partial[PIPE_DEF_BUFFERS];
> + struct page *pages[PIPE_DEF_BUFFERS];
> struct splice_pipe_desc spd = {
> .pages = pages,
> .partial = partial,
> @@ -1546,12 +1548,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
> };
> struct sk_buff *frag_iter;
> struct sock *sk = skb->sk;
> + int ret = 0;
> +
> + if (splice_grow_spd(pipe, &spd))
> + return -ENOMEM;
>
> /*
> * __skb_splice_bits() only fails if the output has no room left,
> * so no point in going over the frag_list for the error case.
> */
> - if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk))
> + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
> goto done;
> else if (!tlen)
> goto done;
> @@ -1562,14 +1568,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
> skb_walk_frags(skb, frag_iter) {
> if (!tlen)
> break;
> - if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk))
> + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
> break;
> }
>
> done:
> if (spd.nr_pages) {
> - int ret;
> -
> /*
> * Drop the socket lock, otherwise we have reverse
> * locking dependencies between sk_lock and i_mutex
> @@ -1582,10 +1586,10 @@ done:
> release_sock(sk);
> ret = splice_to_pipe(pipe, &spd);
> lock_sock(sk);
> - return ret;
> }
>
> - return 0;
> + splice_shrink_spd(pipe, &spd);
> + return ret;
> }
>
> /**
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/