Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient

From: Vlastimil Babka
Date: Fri Oct 18 2013 - 04:11:08 EST


On 10/17/2013 03:48 PM, Robert Jennings wrote:
> * Vlastimil Babka (vbabka@xxxxxxx) wrote:
>> On 10/07/2013 10:21 PM, Robert C Jennings wrote:
>>> Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
>>> pages.
>>>
>>> When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
>>> writer's gift'ed pages would be zapped. This patch supports further work
>>> to move vmsplice'd pages rather than copying them. That patch has the
>>> restriction that the page must not be mapped by the source for the move,
>>> otherwise it will fall back to copying the page.
>>>
>>> Signed-off-by: Matt Helsley <matt.helsley@xxxxxxxxx>
>>> Signed-off-by: Robert C Jennings <rcj@xxxxxxxxxxxxxxxxxx>
>>> ---
>>> Since the RFC went out I have coalesced the zap_page_range() call to
>>> operate on VMAs rather than calling this for each page. For a 256MB
>>> vmsplice this reduced the write side 50% from the RFC.
>>> ---
>>> fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>> include/linux/splice.h | 1 +
>>> 2 files changed, 51 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/splice.c b/fs/splice.c
>>> index 3b7ee65..a62d61e 100644
>>> --- a/fs/splice.c
>>> +++ b/fs/splice.c
>>> @@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> {
>>> unsigned int spd_pages = spd->nr_pages;
>>> int ret, do_wakeup, page_nr;
>>> + struct vm_area_struct *vma;
>>> + unsigned long user_start, user_end;
>>>
>>> ret = 0;
>>> do_wakeup = 0;
>>> page_nr = 0;
>>> + vma = NULL;
>>> + user_start = user_end = 0;
>>>
>>> pipe_lock(pipe);
>>> + down_read(&current->mm->mmap_sem);
>>
>> Seems like you could take the mmap_sem only when GIFT and MOVE is set.
>> Maybe it won't help that much for performance but at least serve as
>> documenting the reason it's needed?
>>
>> Vlastimil
>>
>
> I had been doing that previously but moving this outside the loop and
> acquiring it once did improve performance. I'll add a comment on
> down_read() as to the reason for taking this though.
>
> -Rob

Hm perhaps in light of recent patches to reduce mmap_sem usage only to
really critical regions, maybe it really shouldn't be taken at all if
not needed.

Vlastimil

>>> for (;;) {
>>> if (!pipe->readers) {
>>> @@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> buf->len = spd->partial[page_nr].len;
>>> buf->private = spd->partial[page_nr].private;
>>> buf->ops = spd->ops;
>>> - if (spd->flags & SPLICE_F_GIFT)
>>> + if (spd->flags & SPLICE_F_GIFT) {
>>> + unsigned long useraddr =
>>> + spd->partial[page_nr].useraddr;
>>> +
>>> + if ((spd->flags & SPLICE_F_MOVE) &&
>>> + !buf->offset &&
>>> + (buf->len == PAGE_SIZE)) {
>>> + /* Can move page aligned buf, gather
>>> + * requests to make a single
>>> + * zap_page_range() call per VMA
>>> + */
>>> + if (vma && (useraddr == user_end) &&
>>> + ((useraddr + PAGE_SIZE) <=
>>> + vma->vm_end)) {
>>> + /* same vma, no holes */
>>> + user_end += PAGE_SIZE;
>>> + } else {
>>> + if (vma)
>>> + zap_page_range(vma,
>>> + user_start,
>>> + (user_end -
>>> + user_start),
>>> + NULL);
>>> + vma = find_vma_intersection(
>>> + current->mm,
>>> + useraddr,
>>> + (useraddr +
>>> + PAGE_SIZE));
>>> + if (!IS_ERR_OR_NULL(vma)) {
>>> + user_start = useraddr;
>>> + user_end = (useraddr +
>>> + PAGE_SIZE);
>>> + } else
>>> + vma = NULL;
>>> + }
>>> + }
>>> buf->flags |= PIPE_BUF_FLAG_GIFT;
>>> + }
>>>
>>> pipe->nrbufs++;
>>> page_nr++;
>>> @@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> pipe->waiting_writers--;
>>> }
>>>
>>> + if (vma)
>>> + zap_page_range(vma, user_start, (user_end - user_start), NULL);
>>> +
>>> + up_read(&current->mm->mmap_sem);
>>> pipe_unlock(pipe);
>>>
>>> if (do_wakeup)
>>> @@ -485,6 +530,7 @@ fill_it:
>>>
>>> spd.partial[page_nr].offset = loff;
>>> spd.partial[page_nr].len = this_len;
>>> + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
>>> len -= this_len;
>>> loff = 0;
>>> spd.nr_pages++;
>>> @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
>>> this_len = min_t(size_t, vec[i].iov_len, res);
>>> spd.partial[i].offset = 0;
>>> spd.partial[i].len = this_len;
>>> + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
>>> if (!this_len) {
>>> __free_page(spd.pages[i]);
>>> spd.pages[i] = NULL;
>>> @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
>>>
>>> partial[buffers].offset = off;
>>> partial[buffers].len = plen;
>>> + partial[buffers].useraddr = (unsigned long)base;
>>> + base = (void*)((unsigned long)base + PAGE_SIZE);
>>>
>>> off = 0;
>>> len -= plen;
>>> diff --git a/include/linux/splice.h b/include/linux/splice.h
>>> index 74575cb..56661e3 100644
>>> --- a/include/linux/splice.h
>>> +++ b/include/linux/splice.h
>>> @@ -44,6 +44,7 @@ struct partial_page {
>>> unsigned int offset;
>>> unsigned int len;
>>> unsigned long private;
>>> + unsigned long useraddr;
>>> };
>>>
>>> /*
>>>
>>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxxx For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/