[RFC] 9p: add lguest transport

From: Eric Van Hensbergen
Date: Tue Aug 28 2007 - 15:29:13 EST


From: Eric Van Hensbergen <ericvh@opteron.(none)>

This adds a transport to 9p for communicating between guest and host
domains on lguest. Currently, the host-side proxies the communication to a
socket connected to the actual server. The transport is based heavily on
the existing console code.

A better integrated server component which eliminates some of the copy
overhead is in progress and will look less like the existing console code.

Signed-off-by: Eric Van Hensbergen <ericvh@xxxxxxxxx>
---
Documentation/filesystems/9p.txt | 2 +
Documentation/lguest/lguest.c | 127 ++++++++++++++++
fs/9p/v9fs.c | 2 +-
include/linux/lguest_launcher.h | 1 +
net/9p/Kconfig | 7 +
net/9p/Makefile | 4 +
net/9p/trans_lg.c | 303 ++++++++++++++++++++++++++++++++++++++
7 files changed, 445 insertions(+), 1 deletions(-)
create mode 100644 net/9p/trans_lg.c

diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index e1879bd..1a3342f 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -48,6 +48,8 @@ OPTIONS
(see rfdno and wfdno)
pci - use a PCI pseudo device for 9p communication
over shared memory between a guest and host
+ lg - use a lguest 9p channel for communication
+ over shared memory between a guest and host

uname=name user name to attempt mount as on the remote server. The
server may override or ignore this value. Certain user
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index f791840..adc50de 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1318,6 +1318,128 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
}
/* That's the end of device setup. */

+/* 9p transport code.
+ * This code implements the host side of the 9p transport. Right now
+ * this is heavily based on the console code and just proxies data to
+ * a socket connected to an external server. Eventually we'll hook the
+ * server code in more directly like we do with lguest to avoid the
+ * socket overhead.
+ */
+/* Proxy 9p channel input: read from the server socket (dev->fd) straight
+ * into the DMA buffer the Guest registered, then tell the Guest how much
+ * arrived.
+ *
+ * fd is handed through to get_dma_buffer() and trigger_irq(); dev is the 9p
+ * device.  Returns true once data has been read; a read error or a
+ * zero-length read exits the Launcher. */
+static bool handle_9p_input(int fd, struct device *dev)
+{
+	u32 irq = 0;
+	u32 *lenp;
+	int len;
+	unsigned int num = 0;
+	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+
+	/* First we get the 9p input buffer from the Guest.  The key is
+	 * dev->mem. */
+	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
+	if (!lenp) {
+		/* If it's not ready for input, warn and set up to discard. */
+		warn("9p: no dma buffer!");
+		discard_iovec(iov, &num);
+	}
+
+	/* This is why we convert to iovecs: the readv() call uses them, and so
+	 * it reads straight into the Guest's buffer. */
+	len = readv(dev->fd, iov, num);
+
+	/* Something has gone horribly wrong.  readv() set errno, so use err()
+	 * to report the reason alongside the return value. */
+	if (len < 0)
+		err(1, "9p: input readv returned %d", len);
+
+	/* A zero-length read does not set errno, so use errx() instead of
+	 * printing whatever stale errno happens to be lying around.
+	 *
+	 * BUG: When using msize > 1k we get zero length reads
+	 * and I'm not sure why. */
+	if (len == 0)
+		errx(1, "9p: zero length read!");
+
+	/* If we read the data into the Guest, fill in the length and send the
+	 * interrupt. */
+	if (lenp) {
+		*lenp = len;
+		trigger_irq(fd, irq);
+	}
+
+	/* len > 0 here: both failure cases above exit the Launcher. */
+	return true;
+}
+
+/* Proxy Guest output to the server socket.
+ *
+ * Writes the Guest's iovecs to dev->fd and returns the number of bytes
+ * written.  A failed writev() exits the Launcher: passing its -1 back through
+ * the u32 return type would otherwise be reported as a huge byte count.  The
+ * fd argument is unused here. */
+static u32 handle_9p_output(int fd, const struct iovec *iov,
+			    unsigned num, struct device *dev)
+{
+	ssize_t len = writev(dev->fd, iov, num);
+
+	if (len < 0)
+		err(1, "9p: output writev failed");
+
+	return len;
+}
+
+/* Connect to 9p server (stolen from spfsclient by Lucho Ionkov).
+ * We can't use gethostbyname because it makes us link a shared library, so
+ * arg is parsed as an address (via str2ip()) with an optional ":port" suffix.
+ *
+ * Returns the connected socket.  Any failure exits the Launcher. */
+static int connect_9p(const char *arg)
+{
+	int fd;
+	/* NOTE(review): the IANA-registered 9pfs port is 564 -- confirm that
+	 * defaulting to 567 is intentional. */
+	long port = 567;
+	char *addr, *p, *end;
+	struct sockaddr_in saddr;
+	u32 ipaddr;
+
+	/* Bad arguments do not set errno, so report them with errx(). */
+	if (!arg)
+		errx(1, "9p: problem with args");
+
+	addr = strdup(arg);
+	if (!addr)
+		err(1, "9p: problem copying server address");
+
+	/* Split off the optional port first, so that str2ip() only ever sees
+	 * the address part of the string. */
+	p = strrchr(addr, ':');
+	if (p) {
+		*p++ = '\0';
+		port = strtol(p, &end, 10);
+		/* Reject an empty port, trailing junk and out-of-range values. */
+		if (end == p || *end != '\0' || port < 1 || port > 65535)
+			errx(1, "9p: invalid port format");
+	}
+	ipaddr = str2ip(addr);
+
+	fd = socket(PF_INET, SOCK_STREAM, 0);
+	if (fd < 0)
+		err(1, "9p: problem allocating socket");
+
+	/* Zero the whole structure so its padding is not stack garbage. */
+	memset(&saddr, 0, sizeof(saddr));
+	saddr.sin_family = AF_INET;
+	saddr.sin_port = htons(port);
+	saddr.sin_addr.s_addr = htonl(ipaddr);
+
+	if (connect(fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0)
+		err(1, "9p: problem connecting to server");
+
+	free(addr);
+
+	return fd;
+}
+
+/* Set up the 9p transport: connect to the server named by addr, then create
+ * an lguest device of type LGUEST_DEVICE_T_9P wired to that socket. */
+static void setup_9p(const char *addr, struct device_list *devices)
+{
+	int sock;
+	struct device *p9dev;
+
+	sock = connect_9p(addr);
+
+	/* We allocate a page to store our channel info and
+	 * give a unique offset for our dma key */
+	p9dev = new_device(devices, LGUEST_DEVICE_T_9P, 1, 0, sock,
+			   handle_9p_input, 0, handle_9p_output);
+
+	verbose("device %p: 9p transport\n",
+		(void *)(p9dev->desc->pfn * getpagesize()));
+}
+/* End 9p Additions */
+
/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
* its input and output, and finally, lays it to rest. */
static void __attribute__((noreturn))
@@ -1369,6 +1491,7 @@ static struct option opts[] = {
{ "tunnet", 1, NULL, 't' },
{ "block", 1, NULL, 'b' },
{ "initrd", 1, NULL, 'i' },
+ { "9p", 1, NULL, '9'},
{ NULL },
};
static void usage(void)
@@ -1376,6 +1499,7 @@ static void usage(void)
errx(1, "Usage: lguest [--verbose] "
"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
"|--block=<filename>|--initrd=<filename>]...\n"
+ "[--9p=(<ipaddr>:<port>)] "
"<mem-in-mb> vmlinux [args...]");
}

@@ -1449,6 +1573,9 @@ int main(int argc, char *argv[])
case 'i':
initrd_name = optarg;
break;
+ case '9':
+ setup_9p(optarg, &device_list);
+ break;
default:
warnx("Unknown argument %s", argv[optind]);
usage();
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 08d880f..b39123b 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -244,7 +244,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->maxdata = v9ses->trans->maxsize-P9_IOHDRSZ;

v9ses->clnt = p9_client_create(trans, v9ses->maxdata+P9_IOHDRSZ,
- v9ses->extended);
+ v9ses->extended);

if (IS_ERR(v9ses->clnt)) {
retval = PTR_ERR(v9ses->clnt);
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index 6416705..9170046 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -90,6 +90,7 @@ struct lguest_device_desc {
#define LGUEST_DEVICE_T_CONSOLE 1
#define LGUEST_DEVICE_T_NET 2
#define LGUEST_DEVICE_T_BLOCK 3
+#define LGUEST_DEVICE_T_9P 9

/* The specific features of this device: these depends on device type
* except for LGUEST_DEVICE_F_RANDOMNESS. */
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
index 8517560..fab7bb9 100644
--- a/net/9p/Kconfig
+++ b/net/9p/Kconfig
@@ -31,6 +31,13 @@ config NET_9P_PCI
under KVM/QEMU which allows for 9p transactions over shared
memory between the guest and the host.

+config NET_9P_LG
+ depends on NET_9P
+ tristate "9p Lguest Transport (Experimental)"
+ help
+ This builds support for a transport between an Lguest
+ guest partition and the host partition.
+
config NET_9P_DEBUG
bool "Debug information"
depends on NET_9P
diff --git a/net/9p/Makefile b/net/9p/Makefile
index 26ce89d..80a4227 100644
--- a/net/9p/Makefile
+++ b/net/9p/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_NET_9P) := 9pnet.o
obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o
obj-$(CONFIG_NET_9P_PCI) += 9pnet_pci.o
+obj-$(CONFIG_NET_9P_LG) += 9pnet_lg.o

9pnet-objs := \
mod.o \
@@ -18,3 +19,6 @@ obj-$(CONFIG_NET_9P_PCI) += 9pnet_pci.o

9pnet_pci-objs := \
trans_pci.o \
+
+9pnet_lg-objs := \
+ trans_lg.o \
diff --git a/net/9p/trans_lg.c b/net/9p/trans_lg.c
new file mode 100644
index 0000000..146ed01
--- /dev/null
+++ b/net/9p/trans_lg.c
@@ -0,0 +1,303 @@
+/*
+ * The Guest 9p transport driver
+ *
+ * This is a trivial pipe-based transport driver based on the lguest console
+ * code: we use lguest's DMA mechanism to send bytes out, and register a
+ * DMA buffer to receive bytes in. It is assumed to be present and available
+ * from the very beginning of boot.
+ *
+ * This could have been done by just instantiating another HVC console,
+ * but HVC's blocksize of 16 bytes is annoying and painful to performance.
+ *
+ */
+/*
+ * Copyright (C) 2007 Eric Van Hensbergen, IBM Corporation
+ *
+ * Based on lguest console driver
+ * Copyright (C) 2006 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/lguest_bus.h>
+#include <net/9p/9p.h>
+#include <net/9p/transport.h>
+
+/* 9p Buffer Size in Pages */
+#define P9_BUF_PAGES 16
+/* 9p Buffer Size as 2^x */
+#define P9_BUF_SHIFT 4
+/* only support a single channel for now */
+#define MAX_9P_CHAN 1
+/* for channel names */
+#define NAMELEN 256
+
+/* We keep all per-channel information in a structure.
+ * The structures live in the static channels[] array declared here;
+ * p9_lg_probe() hands them out one per probed device.
+ * A pointer to the structure is stored in the transport's priv field by
+ * p9_lg_create() and in the lguest device's private field by p9_lg_probe().
+ */
+static struct lg_chan {
+ struct lguest_dma input; /* DMA descriptor for the input buffer; used_len != 0 means input is pending */
+ unsigned long offset; /* how much of the pending input p9_lg_read() has already copied out */
+ unsigned long key; /* dma key, derived from the device's pfn in p9_lg_probe() */
+ char *buf; /* input buffer (P9_BUF_PAGES pages) */
+ wait_queue_head_t wq; /* woken from the interrupt handler; polled in p9_lg_poll() */
+ struct lguest_device *dev; /* back pointer to device; not assigned anywhere in this file */
+} channels[MAX_9P_CHAN];
+
+/* Number of bytes from data up to the end of the page that contains it. */
+static unsigned int rest_of_page(void *data)
+{
+	unsigned long in_page = (unsigned long)data % PAGE_SIZE;
+
+	return PAGE_SIZE - in_page;
+}
+
+/* Fill in the DMA descriptor i so that it covers len bytes starting at buf,
+ * broken up along page boundaries.
+ *
+ * The first section runs at most to the end of buf's page; every later
+ * section starts page-aligned and covers at most one page.  If fewer than
+ * LGUEST_MAX_DMA_SECTIONS sections are needed, the next one is given a zero
+ * length.  BUG()s if the buffer does not fit in LGUEST_MAX_DMA_SECTIONS
+ * sections. */
+static void p9_lg_setup_dma(struct lguest_dma *i, void *buf, int len)
+{
+	unsigned int first;
+	int index;
+
+	/* The first section must stop at the end of buf's page even when the
+	 * whole request is no bigger than a page: an unaligned buffer can
+	 * still straddle a page boundary. */
+	first = rest_of_page(buf);
+	if (first > len)
+		first = len;
+
+	i->addr[0] = __pa(buf);
+	i->len[0] = first;
+	buf += first;
+	len -= first;
+
+	for (index = 1; index < LGUEST_MAX_DMA_SECTIONS; index++) {
+		if (len == 0) {
+			/* Nothing left: terminate with a zero-length section. */
+			i->len[index] = 0;
+			break;
+		}
+
+		/* buf is page-aligned from here on, so a whole page (or the
+		 * remainder) never crosses a boundary. */
+		i->addr[index] = __pa(buf);
+		if (len > PAGE_SIZE)
+			i->len[index] = PAGE_SIZE;
+		else
+			i->len[index] = len;
+
+		buf += i->len[index];
+		len -= i->len[index];
+	}
+
+	if (len) {
+		printk(KERN_ERR "9p: lg: buffer didn't fit in dma %d by %d\n",
+		       index, len);
+		BUG();
+	}
+}
+
+/* Send count bytes from buf to the Host using the channel's DMA key.
+ *
+ * The data may span several pages or cross a page boundary, so
+ * p9_lg_setup_dma() splits it into sections first.  Always reports the full
+ * count as written.
+ */
+static int p9_lg_write(struct p9_trans *trans, void *buf, int count)
+{
+	struct lg_chan *chan = trans->priv;
+	struct lguest_dma out;
+
+	p9_lg_setup_dma(&out, buf, count);
+	lguest_send_dma(chan->key, &out);
+
+	return count;
+}
+
+/* Copy up to count bytes of pending channel input into buf.
+ *
+ * This is a naive read implementation that requires an extra copy.  In the
+ * near future we'll be modifying the v9fs transport infrastructure to better
+ * support zero-copy readv/writev style implementations.
+ *
+ * Returns the number of bytes copied, or 0 when no input is pending.
+ */
+static int p9_lg_read(struct p9_trans *trans, void *buf, int count)
+{
+	struct lg_chan *chan = trans->priv;
+
+	/* Nothing has arrived yet. */
+	if (chan->input.used_len == 0)
+		return 0;
+
+	/* You want more than we have to give? Well, try wanting less! */
+	if (count > chan->input.used_len - chan->offset)
+		count = chan->input.used_len - chan->offset;
+
+	/* Copy across to their buffer and increment offset. */
+	memcpy(buf, chan->buf + chan->offset, count);
+	chan->offset += count;
+
+	/* More input still waiting to be consumed: leave the state alone. */
+	if (chan->offset != chan->input.used_len)
+		return count;
+
+	/* Finished: clear used_len so the Host will use the buffer again,
+	 * and rewind the offset. */
+	chan->input.used_len = 0;
+	chan->offset = 0;
+
+	return count;
+}
+
+/* The poll function is used by 9p transports to determine whether there is
+ * activity on a particular channel.  We register on the channel's wait queue
+ * (woken from the interrupt handler) and report input as ready whenever the
+ * input descriptor's used_len is non-zero.
+ */
+static unsigned int
+p9_lg_poll(struct p9_trans *trans, struct poll_table_struct *pt)
+{
+	struct lg_chan *chan = trans->priv;
+
+	poll_wait(NULL, &chan->wq, pt);
+
+	/* we can always handle more output */
+	if (chan->input.used_len)
+		return POLLOUT | POLLIN;
+
+	return POLLOUT;
+}
+
+/* Close the transport: free the p9_trans itself.  The channel it points at
+ * is left untouched. */
+static void p9_lg_close(struct p9_trans *trans)
+{
+	kfree(trans);
+}
+
+/* Interrupt handler: arg is the wait queue passed to request_irq(); wake up
+ * anyone sleeping on it. */
+static irqreturn_t p9_lg_intr(int irq, void *arg)
+{
+	wait_queue_head_t *waitq = arg;
+
+	wake_up_interruptible(waitq);
+	return IRQ_HANDLED;
+}
+
+/* Probe callback: claim the next free channel for lgdev, allocate and bind
+ * its input buffer, and hook up the device interrupt.
+ *
+ * Right now we really only support a single channel -- but things are set up
+ * to allow for multiple channels.  Returns 0; every failure is a BUG(). */
+static int p9_lg_probe(struct lguest_device *lgdev)
+{
+	static int chan_index;
+	struct lg_chan *chan;
+	int err;
+
+	/* Check before indexing, so we never form a pointer past the end of
+	 * channels[]. */
+	if (chan_index >= MAX_9P_CHAN) {
+		printk(KERN_ERR "9p: lg: Maximum channels exceeded\n");
+		BUG();
+	}
+	chan = &channels[chan_index++];
+
+	lgdev->private = chan;
+	chan->key = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
+
+	/* Allocate 2^P9_BUF_SHIFT (== P9_BUF_PAGES) zeroed pages */
+	chan->buf = (char *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
+					     P9_BUF_SHIFT);
+	if (!chan->buf)
+		BUG();
+
+	p9_lg_setup_dma(&chan->input, chan->buf, PAGE_SIZE*P9_BUF_PAGES);
+
+	chan->input.used_len = 0;
+
+	/* We bind a single DMA buffer using the channel's key, and we also
+	 * give the interrupt we want.  The count argument is the number of
+	 * struct lguest_dma entries being bound (just chan->input), not the
+	 * number of pages that entry describes. */
+	err = lguest_bind_dma(chan->key, &chan->input, 1, lgdev_irq(lgdev));
+	if (err) {
+		printk(KERN_ERR "9p: lg: failed to bind buffer.\n");
+		BUG();
+	}
+
+	/* The wait queue must be ready before the handler can run. */
+	init_waitqueue_head(&chan->wq);
+	err = request_irq(lgdev_irq(lgdev), &p9_lg_intr, 0, "p9_lg",
+			  &chan->wq);
+	if (err) {
+		printk(KERN_ERR "9p: lg: failed to obtain irq.\n");
+		BUG();
+	}
+
+	return 0;
+}
+
+/* The standard "struct lguest_driver": names p9_lg_probe() as the probe
+ * routine for devices of type LGUEST_DEVICE_T_9P.  Registered from
+ * p9_lg_init(). */
+static struct lguest_driver p9_lg_drv = {
+ .name = "9p_lg",
+ .owner = THIS_MODULE,
+ .device_type = LGUEST_DEVICE_T_9P,
+ .probe = p9_lg_probe,
+};
+
+/* Create a transport for 9p communication over an lguest channel.
+ *
+ * Right now we always use the first channel; devname and args are ignored.
+ * Eventually we'll be able to look up alternate channels by name.  Nothing
+ * here stops two mounts from sharing the channel.
+ *
+ * Returns the new transport, ERR_PTR(-ENODEV) when not running on lguest, or
+ * ERR_PTR(-ENOMEM) if the transport cannot be allocated. */
+static struct p9_trans *p9_lg_create(const char *devname, char *args)
+{
+	struct lg_chan *chan = &channels[0];	/* don't bother w/match now */
+	struct p9_trans *trans;
+
+	if (strcmp(paravirt_ops.name, "lguest")) {
+		printk(KERN_ERR "9p: not running on lguest, no lg possible\n");
+		return ERR_PTR(-ENODEV);
+	}
+
+	trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+	if (trans == NULL) {
+		printk(KERN_ERR "9p: couldn't allocate transport\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Hook up the channel and the transport operations. */
+	trans->priv = chan;
+	trans->read = p9_lg_read;
+	trans->write = p9_lg_write;
+	trans->poll = p9_lg_poll;
+	trans->close = p9_lg_close;
+
+	return trans;
+}
+
+static struct p9_trans_module p9_lg_trans = {
+ .name = "lg", /* transport name this module registers under */
+ .create = p9_lg_create, /* builds a p9_trans bound to the first channel */
+ .maxsize = 1024, /* NOTE(review): presumably kept at 1k because larger sizes hit zero-length reads on the host side -- confirm */
+ .def = 0,
+};
+
+/* Module init: register the "lg" transport with v9fs, then register the
+ * lguest driver and pass its result back. */
+static int __init p9_lg_init(void)
+{
+	int ret;
+
+	v9fs_register_trans(&p9_lg_trans);
+	ret = register_lguest_driver(&p9_lg_drv);
+
+	return ret;
+}
+
+/* Module exit: removing the transport is not supported, so complain and
+ * BUG(). */
+static void __exit p9_lg_cleanup(void)
+{
+	printk(KERN_ERR "Removal of 9p transports not implemented\n");
+	BUG();
+}
+
+module_init(p9_lg_init);
+module_exit(p9_lg_cleanup);
+
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@xxxxxxxxx>");
+MODULE_DESCRIPTION("9p Lguest Pipe");
+MODULE_LICENSE("GPL");
+
--
1.5.0.2.gfbe3d-dirty

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/