Okay... this patch against the linux-2.5.72 nbd driver I think will be
to a lot more peoples liking. This patch makes NBD work with the new
linux 2.5 block layer design. Specifically, it fixes memory corruption
that results from module removal and possible memory corruption from
sending or receiving disk data from the server. This patch essentially
rolls together the changes from two of the last patchlets that I
emailed: the fix for module removal & the fix for incorrect struct bio
usage. I believe it's wisest to roll these both together into this one
patch since they both deal with making NBD work better with the 2.5
linux block layer design and without either of which, it's possible that
NBD will corrupt memory. Other changes I'd like to see introduced (like
in the earlier jumbo patch) meanwhile are feature enhancements so they
can wait. This patch also should address all the very helpful concerns
that have been raised so far. Particularly:
1. that the very first submitted NBD patch was broken down [Andrew]
2. that only 1 spinlock is used for all the NBD request_queue structures
used [Jens,Al]
3. that kmap() is used in case of highmem pages [Jens]
4. that the allocation of request_queue is dynamic and seperate from
other allocated objects [Al]
If no one has any problems with this patch, would someone be so kind as
to put it into the next distribution?
diff -urN linux-2.5.72/drivers/block/nbd.c linux-2.5.72-p1.5/drivers/block/nbd.c
--- linux-2.5.72/drivers/block/nbd.c 2003-06-16 22:19:44.000000000 -0600
+++ linux-2.5.72-p1.5/drivers/block/nbd.c 2003-06-22 12:42:05.086640056 -0600
@@ -28,6 +28,9 @@
* the transmit lock. <steve@chygwyn.com>
* 02-10-11 Allow hung xmit to be aborted via SIGKILL & various fixes.
* <Paul.Clements@SteelEye.com> <James.Bottomley@SteelEye.com>
+ * 03-06-22 Make nbd work with new linux 2.5 block layer design. This fixes
+ * memory corruption from module removal and possible memory corruption
+ * from sending/receiving disk data. <ldl@aros.net>
*
* possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
* why not: would need verify_area and friends, would share yet another
@@ -63,6 +66,16 @@
static struct nbd_device nbd_dev[MAX_NBD];
+/*
+ * Use just one lock (or at most 1 per NIC). Two arguments for this:
+ * 1. Each NIC is essentially a synchronization point for all servers
+ * accessed through that NIC so there's no need to have more locks
+ * than NICs anyway.
+ * 2. More locks lead to more "Dirty cache line bouncing" which will slow
+ * down each lock to the point where they're actually slower than just
+ * a single lock.
+ * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
+ */
static spinlock_t nbd_lock = SPIN_LOCK_UNLOCKED;
#define DEBUG( s )
@@ -88,13 +101,6 @@
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static int nbd_open(struct inode *inode, struct file *file)
-{
- struct nbd_device *lo = inode->i_bdev->bd_disk->private_data;
- lo->refcnt++;
- return 0;
-}
-
/*
* Send or receive packet.
*/
@@ -168,6 +174,17 @@
return result;
}
+static inline int sock_send_bvec(struct socket *sock, struct bio_vec *bvec,
+ int flags)
+{
+ int result;
+ void *kaddr = kmap(bvec->bv_page);
+ result = nbd_xmit(1, sock, kaddr + bvec->bv_offset, bvec->bv_len,
+ flags);
+ kunmap(bvec->bv_page);
+ return result;
+}
+
#define FAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); goto error_out; }
void nbd_send_req(struct nbd_device *lo, struct request *req)
@@ -209,7 +226,7 @@
if ((i < (bio->bi_vcnt - 1)) || bio->bi_next)
flags = MSG_MORE;
DEBUG("data, ");
- result = nbd_xmit(1, sock, page_address(bvec->bv_page) + bvec->bv_offset, bvec->bv_len, flags);
+ result = sock_send_bvec(sock, bvec, flags);
if (result <= 0)
FAIL("Send data failed.");
}
@@ -244,6 +261,16 @@
return NULL;
}
+static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec)
+{
+ int result;
+ void *kaddr = kmap(bvec->bv_page);
+ result = nbd_xmit(0, sock, kaddr + bvec->bv_offset, bvec->bv_len,
+ MSG_WAITALL);
+ kunmap(bvec->bv_page);
+ return result;
+}
+
#define HARDFAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); lo->harderror = result; return NULL; }
struct request *nbd_read_stat(struct nbd_device *lo)
/* NULL returned = something went wrong, inform userspace */
@@ -251,10 +278,11 @@
int result;
struct nbd_reply reply;
struct request *req;
+ struct socket *sock = lo->sock;
DEBUG("reading control, ");
reply.magic = 0;
- result = nbd_xmit(0, lo->sock, (char *) &reply, sizeof(reply), MSG_WAITALL);
+ result = nbd_xmit(0, sock, (char *) &reply, sizeof(reply), MSG_WAITALL);
if (result <= 0)
HARDFAIL("Recv control failed.");
req = nbd_find_request(lo, reply.handle);
@@ -268,14 +296,17 @@
FAIL("Other side returned error.");
if (nbd_cmd(req) == NBD_CMD_READ) {
- struct bio *bio = req->bio;
+ int i;
+ struct bio *bio;
DEBUG("data, ");
- do {
- result = nbd_xmit(0, lo->sock, bio_data(bio), bio->bi_size, MSG_WAITALL);
- if (result <= 0)
- HARDFAIL("Recv data failed.");
- bio = bio->bi_next;
- } while(bio);
+ rq_for_each_bio(bio, req) {
+ struct bio_vec *bvec;
+ bio_for_each_segment(bvec, bio, i) {
+ result = sock_recv_bvec(sock, bvec);
+ if (result <= 0)
+ HARDFAIL("Recv data failed.");
+ }
+ }
}
DEBUG("done.\n");
return req;
@@ -515,12 +546,52 @@
return -EINVAL;
}
+static int nbd_open(struct inode *inode, struct file *file)
+{
+ struct block_device *bdev = inode->i_bdev;
+ struct gendisk *disk = bdev->bd_disk;
+ struct nbd_device *lo = disk->private_data;
+
+ /*
+ * linux 2.5 fs/block_dev.c ensures that while herein, this same
+ * nbd_device can not be simultaneously opened by another thread.
+ */
+ if (!disk->queue) {
+ struct request_queue *rq;
+
+ /*
+ * The new linux 2.5 block layer implementation requires
+ * every gendisk to have its very own request_queue struct.
+ * These structs are big so we dynamically allocate them and
+ * do so as late as possible (here)...
+ */
+ rq = kmalloc(sizeof(struct request_queue), GFP_KERNEL);
+ if (!rq)
+ return -ENOMEM;
+ blk_init_queue(rq, do_nbd_request, &nbd_lock);
+ disk->queue = rq;
+ }
+ lo->refcnt++;
+ return 0;
+}
+
static int nbd_release(struct inode *inode, struct file *file)
{
- struct nbd_device *lo = inode->i_bdev->bd_disk->private_data;
- if (lo->refcnt <= 0)
- printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt);
+ struct gendisk *disk = inode->i_bdev->bd_disk;
+ struct nbd_device *lo = disk->private_data;
+
+ /*
+ * linux 2.5 fs/block_dev.c ensures that while herein, this same
+ * nbd_device can not be simultaneously released by another thread.
+ */
lo->refcnt--;
+ if (lo->refcnt == 0) {
+ if (disk->queue) {
+ blk_cleanup_queue(disk->queue);
+ kfree(disk->queue);
+ disk->queue = NULL;
+ }
+ }
/* N.B. Doesn't lo->file need an fput?? */
return 0;
}
@@ -538,8 +609,6 @@
* (Just smiley confuses emacs :-)
*/
-static struct request_queue nbd_queue;
-
static int __init nbd_init(void)
{
int err = -ENOMEM;
@@ -562,9 +631,8 @@
goto out;
}
#ifdef MODULE
- printk("nbd: registered device at major %d\n", NBD_MAJOR);
+ printk(KERN_NOTICE "nbd: registered device at major %d\n", NBD_MAJOR);
#endif
- blk_init_queue(&nbd_queue, do_nbd_request, &nbd_lock);
devfs_mk_dir("nbd");
for (i = 0; i < MAX_NBD; i++) {
struct gendisk *disk = nbd_dev[i].disk;
@@ -582,7 +650,7 @@
disk->first_minor = i;
disk->fops = &nbd_fops;
disk->private_data = &nbd_dev[i];
- disk->queue = &nbd_queue;
+ disk->queue = NULL;
sprintf(disk->disk_name, "nbd%d", i);
sprintf(disk->devfs_name, "nbd/%d", i);
set_capacity(disk, 0x3ffffe);
@@ -600,12 +668,22 @@
{
int i;
for (i = 0; i < MAX_NBD; i++) {
- del_gendisk(nbd_dev[i].disk);
- put_disk(nbd_dev[i].disk);
+ struct gendisk *disk = nbd_dev[i].disk;
+ if (disk) {
+ if (disk->queue) {
+ blk_cleanup_queue(disk->queue);
+ kfree(disk->queue);
+ disk->queue = NULL;
+ }
+ del_gendisk(disk);
+ put_disk(disk);
+ }
}
devfs_remove("nbd");
- blk_cleanup_queue(&nbd_queue);
unregister_blkdev(NBD_MAJOR, "nbd");
+#ifdef MODULE
+ printk(KERN_NOTICE "nbd: unregistered device at major %d\n", NBD_MAJOR);
+#endif
}
module_init(nbd_init);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
This archive was generated by hypermail 2b29 : Mon Jun 23 2003 - 22:00:38 EST