Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and bigbuffer handling

From: Michael S. Tsirkin
Date: Mon Jan 17 2011 - 23:38:21 EST


On Tue, Jan 18, 2011 at 11:05:33AM +0800, Jason Wang wrote:
> Michael S. Tsirkin writes:
> > On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
> > > Codes duplication were found between the handling of mergeable and big
> > > buffers, so this patch tries to unify them. This could be easily done
> > > by adding a quota to the get_rx_bufs() which is used to limit the
> > > number of buffers it returns (for mergeable buffer, the quota is
> > > simply UIO_MAXIOV, for big buffers, the quota is just 1), and then the
> > > previous handle_rx_mergeable() could be resued also for big buffers.
> > >
> > > Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
> >
> > We actually started this way. However the code turned out
> > to have measureable overhead when handle_rx_mergeable
> > is called with quota 1.
> > So before applying this I'd like to see some data
> > to show this is not the case anymore.
> >
>
> I've run a round of test (Host->Guest) for these three patches on my desktop:

Yes but what if you only apply patch 3?

> Without these patches
>
> mergeable buffers:
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.42 (10.66.91.42) port 0 AF_INET
> Recv Send Send Utilization Service Demand
> Socket Socket Message Elapsed Send Recv Send Recv
> Size Size Size Time Throughput local remote local remote
> bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
>
> 87380 16384 64 60.00 575.87 69.20 26.36 39.375 7.499
> 87380 16384 256 60.01 1123.57 73.16 34.73 21.335 5.064
> 87380 16384 512 60.00 1351.12 75.26 35.80 18.251 4.341
> 87380 16384 1024 60.00 1955.31 74.73 37.67 12.523 3.156
> 87380 16384 2048 60.01 3411.92 74.82 39.49 7.186 1.896
>
> bug buffers:
> Netperf test results
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.109 (10.66.91.109) port 0 AF_INET
> Recv Send Send Utilization Service Demand
> Socket Socket Message Elapsed Send Recv Send Recv
> Size Size Size Time Throughput local remote local remote
> bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
>
> 87380 16384 64 60.00 567.10 72.06 26.13 41.638 7.550
> 87380 16384 256 60.00 1143.69 74.66 32.58 21.392 4.667
> 87380 16384 512 60.00 1460.92 73.94 33.40 16.585 3.746
> 87380 16384 1024 60.00 3454.85 77.49 33.89 7.349 1.607
> 87380 16384 2048 60.00 3781.11 76.51 38.38 6.631 1.663
>
> With these patches:
>
> mergeable buffers:
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.236 (10.66.91.236) port 0 AF_INET
> Recv Send Send Utilization Service Demand
> Socket Socket Message Elapsed Send Recv Send Recv
> Size Size Size Time Throughput local remote local remote
> bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
>
> 87380 16384 64 60.00 657.53 71.27 26.42 35.517 6.583
> 87380 16384 256 60.00 1217.73 74.34 34.67 20.004 4.665
> 87380 16384 512 60.00 1575.25 75.27 37.12 15.658 3.861
> 87380 16384 1024 60.00 2416.07 74.77 37.20 10.140 2.522
> 87380 16384 2048 60.00 3702.29 77.31 36.29 6.842 1.606
>
> big buffers:
> Netperf test results
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.202 (10.66.91.202) port 0 AF_INET
> Recv Send Send Utilization Service Demand
> Socket Socket Message Elapsed Send Recv Send Recv
> Size Size Size Time Throughput local remote local remote
> bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
>
> 87380 16384 64 60.00 647.67 71.86 27.26 36.356 6.895
> 87380 16384 256 60.00 1265.82 76.19 36.54 19.724 4.729
> 87380 16384 512 60.00 1796.64 76.06 39.48 13.872 3.601
> 87380 16384 1024 60.00 4008.37 77.05 36.47 6.299 1.491
> 87380 16384 2048 60.00 4468.56 75.18 41.79 5.513 1.532
>
> Looks like the unification does not hurt the performance, and with those patches
> we can get some improvement. BTW, the regression of mergeable buffer still
> exist.
>
> > > ---
> > > drivers/vhost/net.c | 128 +++------------------------------------------------
> > > 1 files changed, 7 insertions(+), 121 deletions(-)
> > >
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 95e49de..c32a2e4 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
> > > * @iovcount - returned count of io vectors we fill
> > > * @log - vhost log
> > > * @log_num - log offset
> > > + * @quota - headcount quota, 1 for big buffer
> > > * returns number of buffer heads allocated, negative on error
> > > */
> > > static int get_rx_bufs(struct vhost_virtqueue *vq,
> > > @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > > int datalen,
> > > unsigned *iovcount,
> > > struct vhost_log *log,
> > > - unsigned *log_num)
> > > + unsigned *log_num,
> > > + unsigned int quota)
> > > {
> > > unsigned int out, in;
> > > int seg = 0;
> > > @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > > unsigned d;
> > > int r, nlogs = 0;
> > >
> > > - while (datalen > 0) {
> > > + while (datalen > 0 && headcount < quota) {
> > > if (unlikely(seg >= UIO_MAXIOV)) {
> > > r = -ENOBUFS;
> > > goto err;
> > > @@ -282,116 +284,7 @@ err:
> > >
> > > /* Expects to be always run from workqueue - which acts as
> > > * read-size critical section for our kind of RCU. */
> > > -static void handle_rx_big(struct vhost_net *net)
> > > -{
> > > - struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > > - unsigned out, in, log, s;
> > > - int head;
> > > - struct vhost_log *vq_log;
> > > - struct msghdr msg = {
> > > - .msg_name = NULL,
> > > - .msg_namelen = 0,
> > > - .msg_control = NULL, /* FIXME: get and handle RX aux data. */
> > > - .msg_controllen = 0,
> > > - .msg_iov = vq->iov,
> > > - .msg_flags = MSG_DONTWAIT,
> > > - };
> > > -
> > > - struct virtio_net_hdr hdr = {
> > > - .flags = 0,
> > > - .gso_type = VIRTIO_NET_HDR_GSO_NONE
> > > - };
> > > -
> > > - size_t len, total_len = 0;
> > > - int err;
> > > - size_t hdr_size;
> > > - struct socket *sock = rcu_dereference(vq->private_data);
> > > - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > > - return;
> > > -
> > > - mutex_lock(&vq->mutex);
> > > - vhost_disable_notify(vq);
> > > - hdr_size = vq->vhost_hlen;
> > > -
> > > - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > > - vq->log : NULL;
> > > -
> > > - for (;;) {
> > > - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > - ARRAY_SIZE(vq->iov),
> > > - &out, &in,
> > > - vq_log, &log);
> > > - /* On error, stop handling until the next kick. */
> > > - if (unlikely(head < 0))
> > > - break;
> > > - /* OK, now we need to know about added descriptors. */
> > > - if (head == vq->num) {
> > > - if (unlikely(vhost_enable_notify(vq))) {
> > > - /* They have slipped one in as we were
> > > - * doing that: check again. */
> > > - vhost_disable_notify(vq);
> > > - continue;
> > > - }
> > > - /* Nothing new? Wait for eventfd to tell us
> > > - * they refilled. */
> > > - break;
> > > - }
> > > - /* We don't need to be notified again. */
> > > - if (out) {
> > > - vq_err(vq, "Unexpected descriptor format for RX: "
> > > - "out %d, int %d\n",
> > > - out, in);
> > > - break;
> > > - }
> > > - /* Skip header. TODO: support TSO/mergeable rx buffers. */
> > > - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > > - msg.msg_iovlen = in;
> > > - len = iov_length(vq->iov, in);
> > > - /* Sanity check */
> > > - if (!len) {
> > > - vq_err(vq, "Unexpected header len for RX: "
> > > - "%zd expected %zd\n",
> > > - iov_length(vq->hdr, s), hdr_size);
> > > - break;
> > > - }
> > > - err = sock->ops->recvmsg(NULL, sock, &msg,
> > > - len, MSG_DONTWAIT | MSG_TRUNC);
> > > - /* TODO: Check specific error and bomb out unless EAGAIN? */
> > > - if (err < 0) {
> > > - vhost_discard_vq_desc(vq, 1);
> > > - break;
> > > - }
> > > - /* TODO: Should check and handle checksum. */
> > > - if (err > len) {
> > > - pr_debug("Discarded truncated rx packet: "
> > > - " len %d > %zd\n", err, len);
> > > - vhost_discard_vq_desc(vq, 1);
> > > - continue;
> > > - }
> > > - len = err;
> > > - err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> > > - if (err) {
> > > - vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> > > - vq->iov->iov_base, err);
> > > - break;
> > > - }
> > > - len += hdr_size;
> > > - vhost_add_used_and_signal(&net->dev, vq, head, len);
> > > - if (unlikely(vq_log))
> > > - vhost_log_write(vq, vq_log, log, len);
> > > - total_len += len;
> > > - if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> > > - vhost_poll_queue(&vq->poll);
> > > - break;
> > > - }
> > > - }
> > > -
> > > - mutex_unlock(&vq->mutex);
> > > -}
> > > -
> > > -/* Expects to be always run from workqueue - which acts as
> > > - * read-size critical section for our kind of RCU. */
> > > -static void handle_rx_mergeable(struct vhost_net *net)
> > > +static void handle_rx(struct vhost_net *net)
> > > {
> > > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > > unsigned uninitialized_var(in), log;
> > > @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
> > > sock_len += sock_hlen;
> > > vhost_len = sock_len + vhost_hlen;
> > > headcount = get_rx_bufs(vq, vq->heads, vhost_len,
> > > - &in, vq_log, &log);
> > > + &in, vq_log, &log,
> > > + likely(mergeable) ? UIO_MAXIOV : 1);
> > > /* On error, stop handling until the next kick. */
> > > if (unlikely(headcount < 0))
> > > break;
> > > @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
> > > mutex_unlock(&vq->mutex);
> > > }
> > >
> > > -static void handle_rx(struct vhost_net *net)
> > > -{
> > > - if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
> > > - handle_rx_mergeable(net);
> > > - else
> > > - handle_rx_big(net);
> > > -}
> > > -
> > > static void handle_tx_kick(struct vhost_work *work)
> > > {
> > > struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/