[PATCH 34/50] mars: add new file drivers/block/mars/xio_bricks/xio_if.c

From: Thomas Schoebel-Theuer
Date: Tue Jul 01 2014 - 17:57:08 EST


Signed-off-by: Thomas Schoebel-Theuer <tst@xxxxxxxxxxxxxxxxxx>
---
drivers/block/mars/xio_bricks/xio_if.c | 1037 ++++++++++++++++++++++++++++++++
1 file changed, 1037 insertions(+)
create mode 100644 drivers/block/mars/xio_bricks/xio_if.c

diff --git a/drivers/block/mars/xio_bricks/xio_if.c b/drivers/block/mars/xio_bricks/xio_if.c
new file mode 100644
index 0000000..2c2fa42
--- /dev/null
+++ b/drivers/block/mars/xio_bricks/xio_if.c
@@ -0,0 +1,1037 @@
+/* (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG */
+
+/* Interface to a Linux device.
+ * 1 Input, 0 Outputs.
+ */
+
+#define REQUEST_MERGING /* compiles in the hash-based merging of bio segments into already plugged aios, see if_make_request() */
+#define ALWAYS_UNPLUG true /* OR-ed into do_unplug in if_make_request(): when true, every submitted bio also kicks off the plugged aios */
+#define PREFETCH_LEN PAGE_SIZE /* base length (minus the in-page offset) requested for a newly created aio, see if_make_request() */
+
+/* low-level device parameters, applied to the request queue in if_switch() */
+#define USE_MAX_SECTORS (XIO_MAX_SEGMENT_SIZE >> 9)
+#define USE_MAX_PHYS_SEGMENTS (XIO_MAX_SEGMENT_SIZE >> 9)
+#define USE_MAX_SEGMENT_SIZE XIO_MAX_SEGMENT_SIZE
+#define USE_LOGICAL_BLOCK_SIZE 512
+#define USE_SEGMENT_BOUNDARY (PAGE_SIZE-1)
+
+#define USE_CONGESTED_FN /* if_switch() installs xio_congested() as congested_fn of the queue */
+#define USE_MERGE_BVEC /* if_switch() installs xio_merge_bvec() via blk_queue_merge_bvec() */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <linux/bio.h>
+#include <linux/major.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include <linux/xio.h>
+#include <linux/brick/lib_limiter.h>
+
+#ifndef XIO_MAJOR /* remove this later: fallback to old prepatch */
+#define XIO_MAJOR MARS_MAJOR
+#endif
+
+/************************ global tuning ***********************/
+
+int if_throttle_start_size = 0; /* in kb; write bios of at least this size are throttled in if_make_request(), values <= 0 disable throttling */
+EXPORT_SYMBOL_GPL(if_throttle_start_size);
+
+struct xio_limiter if_throttle = { /* limiter handed to xio_limit_sleep() for the throttled writes */
+ .lim_max_rate = 5000,
+};
+EXPORT_SYMBOL_GPL(if_throttle);
+
+/************************ own type definitions ***********************/
+
+#include <linux/xio/xio_if.h>
+
+#define IF_HASH_MAX (PAGE_SIZE / sizeof(struct if_hash_anchor)) /* number of buckets fitting into the PAGE_SIZE hash table allocated in if_input_construct() */
+#define IF_HASH_CHUNK (PAGE_SIZE * 32) /* if_make_request() maps a position to bucket (pos / IF_HASH_CHUNK) % IF_HASH_MAX */
+
+struct if_hash_anchor { /* one bucket of the per-input hash table of plugged aios */
+ spinlock_t hash_lock; /* taken around every access to hash_anchor */
+ struct list_head hash_anchor; /* list of struct if_aio_aspect, linked via their hash_head */
+};
+
+/************************ own static definitions ***********************/
+
+/* Next minor number to hand out; if_switch() post-increments it for every
+ * device it creates.
+ * TODO: check bounds, ensure that free minor numbers are recycled
+ */
+static int device_minor;
+
+/*************** object * aspect constructors * destructors **************/
+
+/************************ linux operations ***********************/
+
+#ifdef part_stat_lock
+/* Account the start of the bio wrapped by @biow in the statistics of
+ * part0 of the disk belonging to @input (IO count, sector count and
+ * in-flight counter for the bio's direction), then remember the current
+ * jiffies in biow->start_time for _if_end_io_acct().
+ */
+static
+void _if_start_io_acct(struct if_input *input, struct bio_wrapper *biow)
+{
+	struct bio *orig_bio = biow->bio;
+	const int dir = bio_data_dir(orig_bio);
+	const int cpu = part_stat_lock();
+
+	(void)cpu;
+	part_round_stats(cpu, &input->disk->part0);
+	part_stat_inc(cpu, &input->disk->part0, ios[dir]);
+	part_stat_add(cpu, &input->disk->part0, sectors[dir], orig_bio->bi_iter.bi_size >> 9);
+	part_inc_in_flight(&input->disk->part0, dir);
+	part_stat_unlock();
+	biow->start_time = jiffies;
+}
+
+/* Account the completion of the bio wrapped by @biow: add the jiffies
+ * elapsed since biow->start_time to the ticks of the bio's direction and
+ * drop the in-flight counter of part0 of the disk belonging to @input.
+ */
+static
+void _if_end_io_acct(struct if_input *input, struct bio_wrapper *biow)
+{
+	unsigned long elapsed = jiffies - biow->start_time;
+	struct bio *orig_bio = biow->bio;
+	const int dir = bio_data_dir(orig_bio);
+	const int cpu = part_stat_lock();
+
+	(void)cpu;
+	part_stat_add(cpu, &input->disk->part0, ticks[dir], elapsed);
+	part_round_stats(cpu, &input->disk->part0);
+	part_dec_in_flight(&input->disk->part0, dir);
+	part_stat_unlock();
+}
+
+#else /* part_stat_lock */
+#define _if_start_io_acct(...) do {} while (0)
+#define _if_end_io_acct(...) do {} while (0)
+#endif
+
+/* Completion callback of an aio, installed by if_make_request() via
+ * SETUP_CALLBACK().
+ *
+ * For every bio wrapper attached to the aio, the completion counter is
+ * decremented; the wrapper whose counter reaches zero gets its bio
+ * completed (bio_endio), the bio reference taken in if_make_request()
+ * dropped, and the wrapper itself freed.
+ * Finally the flying counters of the input are decremented.
+ */
+static
+void if_endio(struct generic_callback *cb)
+{
+	struct if_aio_aspect *aio_a = cb->cb_private;
+	struct if_input *input;
+	int k;
+	int rw;
+	int error;
+
+	LAST_CALLBACK(cb);
+	if (unlikely(!aio_a || !aio_a->object)) {
+		/* aio_a itself may be NULL on this branch, so it must not
+		 * be dereferenced for the message.
+		 */
+		XIO_FAT("aio_a = %p aio = %p, something is very wrong here!\n",
+			aio_a, aio_a ? aio_a->object : NULL);
+		goto out_return;
+	}
+	input = aio_a->input;
+	CHECK_PTR(input, err);
+
+	rw = aio_a->object->io_rw;
+
+	for (k = 0; k < aio_a->bio_count; k++) {
+		struct bio_wrapper *biow;
+		struct bio *bio;
+
+		biow = aio_a->orig_biow[k];
+		aio_a->orig_biow[k] = NULL;
+		CHECK_PTR(biow, err);
+
+		/* a bio may be spread over several aios: only the last
+		 * completing aio finishes the bio.
+		 */
+		CHECK_ATOMIC(&biow->bi_comp_cnt, 1);
+		if (!atomic_dec_and_test(&biow->bi_comp_cnt))
+			continue;
+
+		bio = biow->bio;
+		CHECK_PTR_NULL(bio, err);
+
+		_if_end_io_acct(input, biow);
+
+		error = CALLBACK_ERROR(aio_a->object);
+		if (unlikely(error < 0)) {
+			unsigned int bi_size = bio->bi_iter.bi_size;
+
+			XIO_ERR("NYI: error=%d RETRY LOGIC %u\n", error, bi_size);
+		} else { /* bio conventions are slightly different... */
+			error = 0;
+			bio->bi_iter.bi_size = 0;
+		}
+		bio_endio(bio, error);
+		bio_put(bio);
+		brick_mem_free(biow);
+	}
+	atomic_dec(&input->flying_count);
+	if (rw)
+		atomic_dec(&input->write_flying_count);
+	else
+		atomic_dec(&input->read_flying_count);
+	goto out_return;
+err:
+	XIO_FAT("error in callback, giving up\n");
+out_return:;
+}
+
+/* Kick off plugged aios.
+ *
+ * The whole plug list of @input is taken over in one step (under
+ * kick_sem and req_lock) and plugged_count is reset. Afterwards each aio
+ * is removed from its hash bucket, gets its final length, is counted as
+ * flying, and is handed to the input via aio_io followed by aio_put.
+ */
+static
+void _if_unplug(struct if_input *input)
+{
+	LIST_HEAD(fire_list);
+
+#ifdef CONFIG_MARS_DEBUG
+	might_sleep();
+#endif
+
+	down(&input->kick_sem);
+	spin_lock(&input->req_lock);
+	if (!list_empty(&input->plug_anchor)) {
+		/* take over all plugged aios at once */
+		list_replace_init(&input->plug_anchor, &fire_list);
+		atomic_set(&input->plugged_count, 0);
+	}
+	spin_unlock(&input->req_lock);
+	up(&input->kick_sem);
+
+	for (;;) {
+		struct if_aio_aspect *aspect;
+		struct aio_object *aio;
+		struct if_hash_anchor *bucket;
+
+		if (list_empty(&fire_list))
+			break;
+
+		aspect = container_of(fire_list.next, struct if_aio_aspect, plug_head);
+		list_del_init(&aspect->plug_head);
+
+		/* from now on, nobody may merge into this aio anymore */
+		bucket = &input->hash_table[aspect->hash_index];
+		spin_lock(&bucket->hash_lock);
+		list_del_init(&aspect->hash_head);
+		spin_unlock(&bucket->hash_lock);
+
+		aio = aspect->object;
+
+		if (unlikely(aspect->current_len > aspect->max_len))
+			XIO_ERR("request len %d > %d\n", aspect->current_len, aspect->max_len);
+		aio->io_len = aspect->current_len;
+
+		atomic_inc(&input->flying_count);
+		atomic_inc(&input->total_fire_count);
+		if (aio->io_rw)
+			atomic_inc(&input->write_flying_count);
+		else
+			atomic_inc(&input->read_flying_count);
+		if (aio->io_skip_sync)
+			atomic_inc(&input->total_skip_sync_count);
+
+		GENERIC_INPUT_CALL(input, aio_io, aio);
+		GENERIC_INPUT_CALL(input, aio_put, aio);
+	}
+}
+
+/* Accept a linux bio, convert it into aios and plug them.
+ *
+ * Each segment of the bio is either merged into an already plugged aio
+ * (only with REQUEST_MERGING) or gets a new aio obtained via aio_get.
+ * The bio is wrapped into a bio_wrapper counting the aios it takes part
+ * in; if_endio() completes the bio once the last of them has finished.
+ * Empty bios and discards are completed immediately with success.
+ * At the end, the plugged aios are kicked off via _if_unplug() when
+ * do_unplug is set or more than brick->max_plugged aios are plugged.
+ *
+ * When BIO_CPU_AFFINE is defined the error status is returned,
+ * otherwise the function returns nothing.
+ */
+static
+#ifdef BIO_CPU_AFFINE
+int
+#else
+void
+#endif
+if_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct if_input *input = q->queuedata;
+	struct if_brick *brick = input->brick;
+
+	/* Original flags of the source bio
+	 */
+	const int rw = bio_data_dir(bio);
+	const int sectors = bio_sectors(bio);
+
+/* adapt to different kernel versions (TBD: improve) */
+#if defined(BIO_RW_RQ_MASK) || defined(BIO_FLUSH)
+	const bool ahead = bio_rw_flagged(bio, BIO_RW_AHEAD) && rw == READ;
+	const bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
+	const bool syncio = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
+	const bool meta = bio_rw_flagged(bio, BIO_RW_META);
+	const bool discard = bio_rw_flagged(bio, BIO_RW_DISCARD);
+	const bool noidle = bio_rw_flagged(bio, BIO_RW_NOIDLE);
+
+#elif defined(REQ_FLUSH) && defined(REQ_SYNC)
+#define _flagged(x) (bio->bi_rw & (x))
+	const bool ahead = _flagged(REQ_RAHEAD) && rw == READ;
+	const bool barrier = _flagged(REQ_FLUSH);
+	const bool syncio = _flagged(REQ_SYNC);
+	const bool unplug = false;
+	const bool meta = _flagged(REQ_META);
+	const bool discard = _flagged(REQ_DISCARD);
+	const bool noidle = _flagged(REQ_THROTTLED);
+
+#else
+#error Cannot decode the bio flags
+#endif
+	const int prio = bio_prio(bio);
+
+	/* Transform into XIO flags
+	 */
+	const int io_prio =
+		(prio == IOPRIO_CLASS_RT || (meta | syncio)) ?
+		XIO_PRIO_HIGH :
+		(prio == IOPRIO_CLASS_IDLE) ?
+		XIO_PRIO_LOW :
+		XIO_PRIO_NORMAL;
+	const bool do_unplug = ALWAYS_UNPLUG | unplug | noidle;
+	const bool do_skip_sync = brick->skip_sync && !(barrier | syncio);
+
+	struct bio_wrapper *biow;
+	struct aio_object *aio = NULL;
+	struct if_aio_aspect *aio_a;
+
+	struct bio_vec bvec;
+	struct bvec_iter i;
+
+	loff_t pos = ((loff_t)bio->bi_iter.bi_sector) << 9; /* TODO: make dynamic */
+	int total_len = bio->bi_iter.bi_size;
+
+	bool assigned = false; /* true once at least one aio refers to biow */
+	int error = -ENOSYS;
+
+	bind_to_channel(brick->say_channel, current);
+
+	might_sleep();
+
+	if (unlikely(!sectors)) {
+		_if_unplug(input);
+		/* THINK: usually this happens only at write barriers.
+		 * We have no "barrier" operation in XIO, since
+		 * callback semantics should always denote
+		 * "writethrough accomplished".
+		 * In case of exceptional semantics, we need to do
+		 * something here. For now, we do just nothing.
+		 */
+		bio_endio(bio, 0);
+		error = 0;
+		goto done;
+	}
+
+	/* throttling of too big write requests */
+	if (rw && if_throttle_start_size > 0) {
+		int kb = (total_len + 512) / 1024;
+
+		if (kb >= if_throttle_start_size)
+			xio_limit_sleep(&if_throttle, kb);
+	}
+
+	(void)ahead; /* shut up gcc */
+	if (unlikely(discard)) { /* NYI */
+		bio_endio(bio, 0);
+		error = 0;
+		goto done;
+	}
+
+	biow = brick_mem_alloc(sizeof(struct bio_wrapper));
+	if (unlikely(!biow)) {
+		/* nothing has been set up yet: just fail the bio */
+		error = -ENOMEM;
+		bio_endio(bio, error);
+		goto done;
+	}
+	biow->bio = bio;
+	atomic_set(&biow->bi_comp_cnt, 0);
+
+	if (rw)
+		atomic_inc(&input->total_write_count);
+	else
+		atomic_inc(&input->total_read_count);
+	_if_start_io_acct(input, biow);
+
+	/* Get a reference to the bio.
+	 * Will be released after bio_endio().
+	 */
+	atomic_inc(&bio->bi_cnt);
+
+	/* FIXME: THIS IS PROVISIONARY (use event instead)
+	 */
+	while (unlikely(!brick->power.on_led))
+		brick_msleep(100);
+
+	down(&input->kick_sem);
+
+	bio_for_each_segment(bvec, bio, i) {
+		struct page *page = bvec.bv_page;
+		int bv_len = bvec.bv_len;
+		int offset = bvec.bv_offset;
+
+		void *data;
+
+#ifdef ARCH_HAS_KMAP
+#error FIXME: the current infrastructure cannot deal with HIGHMEM / kmap()
+#endif
+		data = page_address(page);
+		error = -EINVAL;
+		if (unlikely(!data))
+			break;
+
+		data += offset;
+
+		while (bv_len > 0) {
+			struct list_head *tmp;
+			int hash_index;
+			int this_len = 0;
+
+			aio = NULL;
+			aio_a = NULL;
+
+			hash_index = (pos / IF_HASH_CHUNK) % IF_HASH_MAX;
+
+#ifdef REQUEST_MERGING
+			/* Try to append this segment to a plugged aio which
+			 * works on the same page in the same direction and
+			 * whose data ends exactly where this segment starts.
+			 */
+			spin_lock(&input->hash_table[hash_index].hash_lock);
+			for (tmp = input->hash_table[hash_index].hash_anchor.next; tmp != &input->hash_table[hash_index].hash_anchor; tmp = tmp->next) {
+				struct if_aio_aspect *tmp_a;
+				struct aio_object *tmp_aio;
+				int k; /* must not shadow the bvec iterator i */
+
+				tmp_a = container_of(tmp, struct if_aio_aspect, hash_head);
+				tmp_aio = tmp_a->object;
+				if (tmp_a->orig_page != page || tmp_aio->io_rw != rw || tmp_a->bio_count >= MAX_BIO || tmp_a->current_len + bv_len > tmp_a->max_len)
+					continue;
+
+				if (tmp_aio->io_data + tmp_a->current_len == data)
+					goto merge_end;
+				continue;
+
+merge_end:
+				tmp_a->current_len += bv_len;
+				aio = tmp_aio;
+				aio_a = tmp_a;
+				this_len = bv_len;
+				if (!do_skip_sync)
+					aio->io_skip_sync = false;
+
+				/* attach the wrapper only once per aio */
+				for (k = 0; k < aio_a->bio_count; k++) {
+					if (aio_a->orig_biow[k]->bio == bio)
+						goto unlock;
+				}
+
+				CHECK_ATOMIC(&biow->bi_comp_cnt, 0);
+				atomic_inc(&biow->bi_comp_cnt);
+				aio_a->orig_biow[aio_a->bio_count++] = biow;
+				assigned = true;
+				goto unlock;
+			} /* foreach hash collision list member */
+
+unlock:
+			spin_unlock(&input->hash_table[hash_index].hash_lock);
+#endif
+			if (!aio) {
+				int prefetch_len;
+
+				error = -ENOMEM;
+				aio = if_alloc_aio(brick);
+				aio_a = if_aio_get_aspect(brick, aio);
+				if (unlikely(!aio_a)) {
+					up(&input->kick_sem);
+					goto err;
+				}
+
+#ifdef PREFETCH_LEN
+				prefetch_len = PREFETCH_LEN - offset;
+/**/
+				if (prefetch_len > total_len)
+					prefetch_len = total_len;
+				if (pos + prefetch_len > brick->dev_size)
+					prefetch_len = brick->dev_size - pos;
+				if (prefetch_len < bv_len)
+					prefetch_len = bv_len;
+#else
+				prefetch_len = bv_len;
+#endif
+
+				SETUP_CALLBACK(aio, if_endio, aio_a);
+
+				aio_a->input = input;
+				aio->io_rw = aio->io_may_write = rw;
+				aio->io_pos = pos;
+				aio->io_len = prefetch_len;
+				aio->io_data = data; /* direct IO */
+				aio->io_prio = io_prio;
+				aio_a->orig_page = page;
+
+				error = GENERIC_INPUT_CALL(input, aio_get, aio);
+				if (unlikely(error < 0)) {
+					up(&input->kick_sem);
+					goto err;
+				}
+
+				this_len = aio->io_len; /* now may be shorter than originally requested. */
+				aio_a->max_len = this_len;
+				if (this_len > bv_len)
+					this_len = bv_len;
+				aio_a->current_len = this_len;
+				if (rw)
+					atomic_inc(&input->total_aio_write_count);
+				else
+					atomic_inc(&input->total_aio_read_count);
+				CHECK_ATOMIC(&biow->bi_comp_cnt, 0);
+				atomic_inc(&biow->bi_comp_cnt);
+				aio_a->orig_biow[0] = biow;
+				aio_a->bio_count = 1;
+				assigned = true;
+
+				/* When a bio with multiple biovecs is split into
+				 * multiple aios, only the last one should be
+				 * working in synchronous writethrough mode.
+				 */
+				aio->io_skip_sync = true;
+				/* NOTE(review): bio->bi_iter.bi_idx looks like the
+				 * start index of the bio rather than its number of
+				 * bvecs, so this condition appears to hold for every
+				 * segment - confirm whether bio->bi_vcnt was meant.
+				 */
+				if (!do_skip_sync && i.bi_idx + 1 >= bio->bi_iter.bi_idx)
+					aio->io_skip_sync = false;
+
+				atomic_inc(&input->plugged_count);
+
+				aio_a->hash_index = hash_index;
+				spin_lock(&input->hash_table[hash_index].hash_lock);
+				list_add_tail(&aio_a->hash_head, &input->hash_table[hash_index].hash_anchor);
+				spin_unlock(&input->hash_table[hash_index].hash_lock);
+
+				spin_lock(&input->req_lock);
+				list_add_tail(&aio_a->plug_head, &input->plug_anchor);
+				spin_unlock(&input->req_lock);
+			} /* !aio */
+
+			pos += this_len;
+			data += this_len;
+			bv_len -= this_len;
+			total_len -= this_len;
+		} /* while bv_len > 0 */
+	} /* foreach bvec */
+
+	up(&input->kick_sem);
+
+	if (likely(!total_len))
+		error = 0;
+	else
+		XIO_ERR("bad rest len = %d\n", total_len);
+err:
+	if (error < 0) {
+		XIO_ERR("cannot submit request from bio, status=%d\n", error);
+		if (!assigned) {
+			/* No aio refers to the wrapper, so if_endio() will
+			 * never run for it: undo the accounting, complete
+			 * the bio, and release our bio reference and the
+			 * wrapper here.
+			 */
+			_if_end_io_acct(input, biow);
+			bio_endio(bio, error);
+			bio_put(bio);
+			brick_mem_free(biow);
+		}
+	}
+
+	if (do_unplug ||
+	    (brick && brick->max_plugged > 0 && atomic_read(&input->plugged_count) > brick->max_plugged)) {
+		_if_unplug(input);
+	}
+
+done:
+	remove_binding_from(brick->say_channel, current);
+
+#ifdef BIO_CPU_AFFINE
+	return error;
+#endif
+}
+
+#ifndef BLK_MAX_REQUEST_COUNT
+/* static */
+/* Unplug function of the request queue (installed as q->unplug_fn by
+ * if_switch()): removes the plug from @q under the queue lock and then
+ * kicks off all plugged aios of the input stored in q->queuedata.
+ */
+void if_unplug(struct request_queue *q)
+{
+	struct if_input *input = q->queuedata;
+
+	spin_lock_irq(q->queue_lock);
+	/* the result (whether the queue was plugged) is not needed */
+	(void)blk_remove_plug(q);
+	spin_unlock_irq(q->queue_lock);
+
+	_if_unplug(input);
+}
+#endif
+
+/* static */
+/* Congestion callback (installed as congested_fn by if_switch()).
+ * @data is the if_input, @bdi_bits the mask of BDI congestion bits asked
+ * for. Of the requested bits, sync congestion is reported while reads
+ * are flying and async congestion while writes are flying.
+ */
+int xio_congested(void *data, int bdi_bits)
+{
+	struct if_input *input = data;
+	const int sync_bit = 1 << BDI_sync_congested;
+	const int async_bit = 1 << BDI_async_congested;
+	int congested = 0;
+
+	if ((bdi_bits & sync_bit) && atomic_read(&input->read_flying_count) > 0)
+		congested |= sync_bit;
+	if ((bdi_bits & async_bit) && atomic_read(&input->write_flying_count) > 0)
+		congested |= async_bit;
+	return congested;
+}
+
+/* merge_bvec callback (installed via blk_queue_merge_bvec() by
+ * if_switch()): for a still empty bio the full length of @bvec is
+ * returned, for any other bio the constant 128.
+ */
+static
+int xio_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+	if (bvm->bi_size == 0)
+		return bvec->bv_len;
+
+	return 128;
+}
+
+/* Return the device size of @brick in bytes.
+ *
+ * An already known brick->dev_size is returned as is, because it may
+ * deliberately differ from the underlying size (e.g. when the size
+ * symlink indicates a logically smaller device than physically).
+ * Only when it is unknown (<= 0), the input is asked via xio_get_info and
+ * the result is cached in brick->dev_size. Returns 0 when that fails.
+ */
+static
+loff_t if_get_capacity(struct if_brick *brick)
+{
+	struct xio_info info = {};
+	struct if_input *input;
+	int status;
+
+	if (brick->dev_size > 0)
+		return brick->dev_size;
+
+	input = brick->inputs[0];
+	status = GENERIC_INPUT_CALL(input, xio_get_info, &info);
+	if (unlikely(status < 0)) {
+		XIO_ERR("cannot get device info, status=%d\n", status);
+		return 0;
+	}
+
+	XIO_INF("determined default capacity: %lld bytes\n", info.current_size);
+	brick->dev_size = info.current_size;
+	return brick->dev_size;
+}
+
+/* Publish a new @capacity (in bytes) for the device of @input: it is
+ * remembered in input->capacity, set on the gendisk in units of 512 byte
+ * sectors, and written as inode size of the block device when one is
+ * attached. Does nothing when the disk is missing.
+ */
+static
+void if_set_capacity(struct if_input *input, loff_t capacity)
+{
+	struct block_device *bdev;
+
+	CHECK_PTR(input->disk, done);
+	CHECK_PTR(input->disk->disk_name, done);
+	XIO_INF("new capacity of '%s': %lld bytes\n", input->disk->disk_name, capacity);
+
+	input->capacity = capacity;
+	set_capacity(input->disk, capacity >> 9);
+
+	bdev = input->bdev;
+	if (likely(bdev && bdev->bd_inode))
+		i_size_write(bdev->bd_inode, capacity);
+done:;
+}
+
+static const struct block_device_operations if_blkdev_ops;
+
+/* Switch the brick on or off, depending on brick->power.button.
+ *
+ * - While in operation, a changed capacity is propagated to the device.
+ * - Switching on allocates the request queue and the gendisk, configures
+ *   the queue limits and callbacks, and makes the disk visible via
+ *   add_disk().
+ * - Switching off is refused with -EBUSY while the device is open or has
+ *   flying requests; otherwise the block device and the gendisk are
+ *   released.
+ *
+ * All of this is serialized by brick->switch_sem.
+ * Returns 0 on success or a negative error code.
+ */
+static int if_switch(struct if_brick *brick)
+{
+	struct if_input *input = brick->inputs[0];
+	struct request_queue *q;
+	struct gendisk *disk;
+	int minor;
+	int status = 0;
+
+	down(&brick->switch_sem);
+
+	/* brick is in operation */
+	if (brick->power.button && brick->power.on_led) {
+		loff_t capacity;
+
+		capacity = if_get_capacity(brick);
+		if (capacity > 0 && capacity != input->capacity) {
+			XIO_INF("changing capacity from %lld to %lld\n",
+				(long long)input->capacity,
+				(long long)capacity);
+			if_set_capacity(input, capacity);
+		}
+	}
+
+	/* brick should be switched on */
+	if (brick->power.button && brick->power.off_led) {
+		loff_t capacity;
+
+		xio_set_power_off_led((void *)brick, false);
+		brick->say_channel = get_binding(current);
+
+		status = -ENOMEM;
+		q = blk_alloc_queue(GFP_BRICK);
+		if (!q) {
+			XIO_ERR("cannot allocate device request queue\n");
+			goto is_down;
+		}
+		q->queuedata = input;
+		input->q = q;
+
+		disk = alloc_disk(1);
+		if (!disk) {
+			XIO_ERR("cannot allocate gendisk\n");
+			/* do not leak the queue allocated above */
+			blk_cleanup_queue(q);
+			input->q = NULL;
+			goto is_down;
+		}
+
+		minor = device_minor++; /* TODO: protect against races (e.g. atomic_t) */
+		set_disk_ro(disk, true);
+
+		disk->queue = q;
+		disk->major = XIO_MAJOR; /* TODO: make this dynamic for >256 devices */
+		disk->first_minor = minor;
+		disk->fops = &if_blkdev_ops;
+		snprintf(disk->disk_name, sizeof(disk->disk_name), "%s", brick->brick_name);
+		disk->private_data = input;
+		input->disk = disk;
+		capacity = if_get_capacity(brick);
+		XIO_DBG("created device name %s, capacity=%lld\n", disk->disk_name, capacity);
+		if_set_capacity(input, capacity);
+
+		blk_queue_make_request(q, if_make_request);
+#ifdef USE_MAX_SECTORS
+#ifdef MAX_SEGMENT_SIZE
+		XIO_DBG("blk_queue_max_sectors()\n");
+		blk_queue_max_sectors(q, USE_MAX_SECTORS);
+#else
+		XIO_DBG("blk_queue_max_hw_sectors()\n");
+		blk_queue_max_hw_sectors(q, USE_MAX_SECTORS);
+#endif
+#endif
+#ifdef USE_MAX_PHYS_SEGMENTS
+#ifdef MAX_SEGMENT_SIZE
+		XIO_DBG("blk_queue_max_phys_segments()\n");
+		blk_queue_max_phys_segments(q, USE_MAX_PHYS_SEGMENTS);
+#else
+		XIO_DBG("blk_queue_max_segments()\n");
+		blk_queue_max_segments(q, USE_MAX_PHYS_SEGMENTS);
+#endif
+#endif
+#ifdef USE_MAX_HW_SEGMENTS
+		XIO_DBG("blk_queue_max_hw_segments()\n");
+		blk_queue_max_hw_segments(q, USE_MAX_HW_SEGMENTS);
+#endif
+#ifdef USE_MAX_SEGMENT_SIZE
+		XIO_DBG("blk_queue_max_segment_size()\n");
+		blk_queue_max_segment_size(q, USE_MAX_SEGMENT_SIZE);
+#endif
+#ifdef USE_LOGICAL_BLOCK_SIZE
+		XIO_DBG("blk_queue_logical_block_size()\n");
+		blk_queue_logical_block_size(q, USE_LOGICAL_BLOCK_SIZE);
+#endif
+#ifdef USE_SEGMENT_BOUNDARY
+		XIO_DBG("blk_queue_segment_boundary()\n");
+		blk_queue_segment_boundary(q, USE_SEGMENT_BOUNDARY);
+#endif
+#ifdef QUEUE_ORDERED_DRAIN
+		XIO_DBG("blk_queue_ordered()\n");
+		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN, NULL);
+#endif
+		XIO_DBG("blk_queue_bounce_limit()\n");
+		blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+#ifndef BLK_MAX_REQUEST_COUNT
+		XIO_DBG("unplug_fn\n");
+		q->unplug_fn = if_unplug;
+#endif
+		XIO_DBG("queue_lock\n");
+		q->queue_lock = &input->req_lock; /* needed! */
+
+		input->bdev = bdget(MKDEV(disk->major, minor));
+		/* bdget() may fail; all later users of input->bdev in this
+		 * file already tolerate a NULL pointer.
+		 */
+		if (likely(input->bdev)) {
+			/* we have no partitions. we contain only ourselves. */
+			input->bdev->bd_contains = input->bdev;
+		} else {
+			XIO_ERR("cannot get block device\n");
+		}
+
+#ifdef USE_CONGESTED_FN
+		XIO_DBG("congested_fn\n");
+		q->backing_dev_info.congested_fn = xio_congested;
+		q->backing_dev_info.congested_data = input;
+#endif
+#ifdef USE_MERGE_BVEC
+		XIO_DBG("blk_queue_merge_bvec()\n");
+		blk_queue_merge_bvec(q, xio_merge_bvec);
+#endif
+
+		/* point of no return */
+		XIO_DBG("add_disk()\n");
+		add_disk(disk);
+		set_disk_ro(disk, false);
+
+		/* report success */
+		xio_set_power_on_led((void *)brick, true);
+		status = 0;
+	}
+
+	/* brick should be switched off */
+	if (!brick->power.button && !brick->power.off_led) {
+		int flying;
+
+		xio_set_power_on_led((void *)brick, false);
+		disk = input->disk;
+		if (!disk)
+			goto is_down;
+
+		if (atomic_read(&brick->open_count) > 0) {
+			XIO_INF("device '%s' is open %d times, cannot shutdown\n",
+				disk->disk_name,
+				atomic_read(&brick->open_count));
+			status = -EBUSY;
+			goto done; /* don't indicate "off" status */
+		}
+		flying = atomic_read(&input->flying_count);
+		if (flying > 0) {
+			XIO_INF("device '%s' has %d flying requests, cannot shutdown\n", disk->disk_name, flying);
+			status = -EBUSY;
+			goto done; /* don't indicate "off" status */
+		}
+		if (input->bdev) {
+			XIO_DBG("calling bdput()\n");
+			bdput(input->bdev);
+			input->bdev = NULL;
+		}
+		XIO_DBG("calling del_gendisk()\n");
+		del_gendisk(input->disk);
+		XIO_DBG("calling put_disk()\n");
+		put_disk(input->disk);
+		input->disk = NULL;
+		/* NOTE(review): input->q is not cleaned up here - confirm
+		 * whether the request queue is released elsewhere.
+		 */
+		status = 0;
+is_down:
+		/* also reached from the failure paths of switching on */
+		xio_set_power_off_led((void *)brick, true);
+	}
+
+done:
+	up(&brick->switch_sem);
+	return status;
+}
+
+/*************** interface to the outer world (kernel) **************/
+
+/* open() operation of the block device.
+ *
+ * Counts the opener in brick->open_count, under brick->switch_sem so that
+ * it cannot race with if_switch().
+ * Returns -EINVAL when the device is not set up properly, -EBUSY when the
+ * brick is not switched on, and 0 on success.
+ */
+static int if_open(struct block_device *bdev, fmode_t mode)
+{
+	struct if_input *input;
+	struct if_brick *brick;
+	int status = 0;
+
+	if (unlikely(!bdev || !bdev->bd_disk)) {
+		XIO_ERR("----------------------- INVAL ------------------------------\n");
+		return -EINVAL;
+	}
+
+	input = bdev->bd_disk->private_data;
+	if (unlikely(!input || !input->brick)) {
+		XIO_ERR("----------------------- BAD IF SETUP ------------------------------\n");
+		return -EINVAL;
+	}
+	brick = input->brick;
+
+	down(&brick->switch_sem);
+	if (likely(brick->power.on_led)) {
+		atomic_inc(&brick->open_count);
+		XIO_INF("----------------------- OPEN %d ------------------------------\n", atomic_read(&brick->open_count));
+	} else {
+		XIO_INF("----------------------- BUSY %d ------------------------------\n",
+			atomic_read(&brick->open_count));
+		status = -EBUSY;
+	}
+	up(&brick->switch_sem);
+
+	return status;
+}
+
+/* release() operation of the block device.
+ *
+ * Drops one opener from brick->open_count. The last closer waits (polling
+ * once per second) until no requests are flying anymore and then calls
+ * local_trigger().
+ */
+static
+void
+if_release(struct gendisk *gd, fmode_t mode)
+{
+	struct if_input *input = gd->private_data;
+	struct if_brick *brick = input->brick;
+	int pending;
+
+	XIO_INF("----------------------- CLOSE %d ------------------------------\n", atomic_read(&brick->open_count));
+
+	if (!atomic_dec_and_test(&brick->open_count))
+		return;
+
+	for (;;) {
+		pending = atomic_read(&input->flying_count);
+		if (pending <= 0)
+			break;
+		XIO_INF("%d IO requests not yet completed\n", pending);
+		brick_msleep(1000);
+	}
+
+	XIO_DBG("status button=%d on_led=%d off_led=%d\n",
+		brick->power.button,
+		brick->power.on_led,
+		brick->power.off_led);
+	local_trigger();
+}
+
+static const struct block_device_operations if_blkdev_ops = { /* assigned to disk->fops in if_switch() */
+ .owner = THIS_MODULE,
+ .open = if_open, /* counts the opener in brick->open_count */
+ .release = if_release, /* last closer waits until no requests are flying */
+
+};
+
+/*************** informational * statistics **************/
+
+/* Produce a one-line, human readable summary of the counters of the
+ * brick's input. @verbose is not used.
+ *
+ * Returns a string obtained from brick_string_alloc(), or NULL when that
+ * allocation failed.
+ */
+static
+char *if_statistics(struct if_brick *brick, int verbose)
+{
+	struct if_input *input = brick->inputs[0];
+	char *res = brick_string_alloc(512);
+	int tmp0 = atomic_read(&input->total_reada_count);
+	int tmp1 = atomic_read(&input->total_read_count);
+	int tmp2 = atomic_read(&input->total_aio_read_count);
+	int tmp3 = atomic_read(&input->total_write_count);
+	int tmp4 = atomic_read(&input->total_aio_write_count);
+
+	/* never hand a failed allocation to snprintf() */
+	if (unlikely(!res))
+		return NULL;
+
+	snprintf(res, 512,
+		"total reada = %d reads = %d aio_reads = %d (%d%%) writes = %d aio_writes = %d (%d%%) empty = %d fired = %d skip_sync = %d | plugged = %d flying = %d (reads = %d writes = %d)\n",
+		tmp0,
+		tmp1,
+		tmp2,
+		tmp1 ? tmp2 * 100 / tmp1 : 0, /* guard against division by zero */
+		tmp3,
+		tmp4,
+		tmp3 ? tmp4 * 100 / tmp3 : 0,
+		atomic_read(&input->total_empty_count),
+		atomic_read(&input->total_fire_count),
+		atomic_read(&input->total_skip_sync_count),
+		atomic_read(&input->plugged_count),
+		atomic_read(&input->flying_count),
+		atomic_read(&input->read_flying_count),
+		atomic_read(&input->write_flying_count));
+	return res;
+}
+
+/* Reset all cumulative ("total") counters reported by if_statistics().
+ * The counters describing the current state (plugged / flying) are left
+ * untouched.
+ */
+static
+void if_reset_statistics(struct if_brick *brick)
+{
+	struct if_input *input = brick->inputs[0];
+
+	/* total_reada_count is reported by if_statistics() like the other totals */
+	atomic_set(&input->total_reada_count, 0);
+	atomic_set(&input->total_read_count, 0);
+	atomic_set(&input->total_write_count, 0);
+	atomic_set(&input->total_empty_count, 0);
+	atomic_set(&input->total_fire_count, 0);
+	atomic_set(&input->total_skip_sync_count, 0);
+	atomic_set(&input->total_aio_read_count, 0);
+	atomic_set(&input->total_aio_write_count, 0);
+}
+
+/***************** own brick * input * output operations *****************/
+
+/* none */
+
+/*************** object * aspect constructors * destructors **************/
+
+/* Aspect constructor: puts both list heads of the if_aio_aspect into the
+ * empty state. Always returns 0.
+ */
+static int if_aio_aspect_init_fn(struct generic_aspect *_ini)
+{
+	struct if_aio_aspect *aio_a = (void *)_ini;
+
+	INIT_LIST_HEAD(&aio_a->hash_head);
+	INIT_LIST_HEAD(&aio_a->plug_head);
+
+	return 0;
+}
+
+static void if_aio_aspect_exit_fn(struct generic_aspect *_ini) /* aspect destructor: only runs the CHECK_HEAD_EMPTY checks below */
+{
+ struct if_aio_aspect *ini = (void *)_ini;
+
+ CHECK_HEAD_EMPTY(&ini->plug_head); /* list head used for input->plug_anchor */
+ CHECK_HEAD_EMPTY(&ini->hash_head); /* list head used for the hash buckets */
+}
+
+XIO_MAKE_STATICS(if); /* NOTE(review): presumably generates the if_* helpers used but not defined in this file (e.g. if_aspect_types, if_alloc_aio()) - confirm against the macro definition */
+
+/*********************** constructors * destructors ***********************/
+
+/* Brick constructor: starts with no openers and a free switch_sem.
+ * Always returns 0.
+ */
+static int if_brick_construct(struct if_brick *brick)
+{
+	atomic_set(&brick->open_count, 0);
+	sema_init(&brick->switch_sem, 1);
+
+	return 0;
+}
+
+static int if_brick_destruct(struct if_brick *brick) /* brick destructor: nothing to release, always returns 0 */
+{
+ return 0;
+}
+
+/* Input constructor: allocates the one-page hash table of plugged aios
+ * and initializes all buckets, the plug list, the locks and the
+ * state counters.
+ *
+ * Returns 0 on success, -ENOMEM when the hash table cannot be allocated.
+ */
+static int if_input_construct(struct if_input *input)
+{
+	int i;
+
+	input->hash_table = brick_block_alloc(0, PAGE_SIZE);
+	/* the table is indexed right below, so it must exist */
+	if (unlikely(!input->hash_table))
+		return -ENOMEM;
+
+	for (i = 0; i < IF_HASH_MAX; i++) {
+		spin_lock_init(&input->hash_table[i].hash_lock);
+		INIT_LIST_HEAD(&input->hash_table[i].hash_anchor);
+	}
+	INIT_LIST_HEAD(&input->plug_anchor);
+	sema_init(&input->kick_sem, 1);
+	spin_lock_init(&input->req_lock);
+	atomic_set(&input->flying_count, 0);
+	atomic_set(&input->read_flying_count, 0);
+	atomic_set(&input->write_flying_count, 0);
+	atomic_set(&input->plugged_count, 0);
+	return 0;
+}
+
+static int if_input_destruct(struct if_input *input) /* input destructor: checks the lists and frees the hash table, always returns 0 */
+{
+ int i;
+
+ for (i = 0; i < IF_HASH_MAX; i++)
+ CHECK_HEAD_EMPTY(&input->hash_table[i].hash_anchor);
+ CHECK_HEAD_EMPTY(&input->plug_anchor);
+ brick_block_free(input->hash_table, PAGE_SIZE); /* counterpart of brick_block_alloc() in if_input_construct() */
+ return 0;
+}
+
+static int if_output_construct(struct if_output *output) /* output constructor: nothing to set up, always returns 0 */
+{
+ return 0;
+}
+
+/************************ static structs ***********************/
+
+static struct if_brick_ops if_brick_ops = { /* referenced as master_ops of if_brick_type */
+ .brick_switch = if_switch,
+ .brick_statistics = if_statistics,
+ .reset_statistics = if_reset_statistics,
+};
+
+static struct if_output_ops if_output_ops; /* no operations are filled in; referenced as master_ops of if_output_type */
+
+const struct if_input_type if_input_type = { /* type descriptor of the single input */
+ .type_name = "if_input",
+ .input_size = sizeof(struct if_input),
+ .input_construct = &if_input_construct,
+ .input_destruct = &if_input_destruct,
+};
+
+static const struct if_input_type *if_input_types[] = { /* referenced as default_input_types of if_brick_type */
+ &if_input_type,
+};
+
+const struct if_output_type if_output_type = { /* output type descriptor; note that if_brick_type declares max_outputs = 0 */
+ .type_name = "if_output",
+ .output_size = sizeof(struct if_output),
+ .master_ops = &if_output_ops,
+ .output_construct = &if_output_construct,
+};
+
+static const struct if_output_type *if_output_types[] = { /* referenced as default_output_types of if_brick_type */
+ &if_output_type,
+};
+
+const struct if_brick_type if_brick_type = { /* exported type descriptor of the if brick */
+ .type_name = "if_brick",
+ .brick_size = sizeof(struct if_brick),
+ .max_inputs = 1, /* 1 input, 0 outputs, as stated in the file header */
+ .max_outputs = 0,
+ .master_ops = &if_brick_ops,
+ .aspect_types = if_aspect_types,
+ .default_input_types = if_input_types,
+ .default_output_types = if_output_types,
+ .brick_construct = &if_brick_construct,
+ .brick_destruct = &if_brick_destruct,
+};
+EXPORT_SYMBOL_GPL(if_brick_type);
+
+/***************** module init stuff ************************/
+
+/* Module teardown: unregisters the brick type and the block device major.
+ * Also used by init_xio_if() to roll back a failed initialization.
+ */
+void exit_xio_if(void)
+{
+	XIO_INF("exit_if()\n");
+	/* there is nothing we could do about a failure at this point */
+	(void)if_unregister_brick_type();
+	unregister_blkdev(XIO_MAJOR, "xio");
+}
+
+/* Module initialization: registers the block device major XIO_MAJOR under
+ * the name "xio" and then the brick type. When the latter fails,
+ * everything is rolled back via exit_xio_if().
+ *
+ * Returns 0 on success, otherwise the status of the failed registration.
+ */
+int __init init_xio_if(void)
+{
+	int status;
+
+	(void)if_aspect_types; /* not used, shut up gcc */
+
+	XIO_INF("init_if()\n");
+
+	status = register_blkdev(XIO_MAJOR, "xio");
+	if (status)
+		return status;
+
+	status = if_register_brick_type();
+	if (!status)
+		return 0;
+
+	XIO_ERR("init_if() status=%d\n", status);
+	exit_xio_if();
+	return status;
+}
--
2.0.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/