[PATCH 01/19] raid5: raid5_do_soft_block_ops

From: Dan Williams
Date: Mon Sep 11 2006 - 19:14:36 EST


From: Dan Williams <dan.j.williams@xxxxxxxxx>

raid5_do_soft_block_ops consolidates all the stripe cache maintenance
operations into a single routine that runs outside the stripe lock. The
stripe operations are:
* copying data between the stripe cache and user application buffers
* computing blocks to save a disk access, or to recover a missing block
* updating the parity on a write operation (reconstruct write and
read-modify-write)
* checking parity correctness

Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---

drivers/md/raid5.c | 289 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/raid/raid5.h | 129 +++++++++++++++++++-
2 files changed, 415 insertions(+), 3 deletions(-)
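
For illustration only (not part of the patch): under this scheme a
handle_stripe path that wants a missing block rebuilt would flag the
request and kick the worker roughly as follows, using the flags and the
queue_raid_work() helper introduced below (a sketch; error handling and
the surrounding handle_stripe logic are omitted):

	spin_lock(&sh->lock);
	/* name the block to rebuild; handle_stripe treats it as uptodate */
	set_bit(R5_ComputeReq, &sh->dev[dd_idx].flags);
	/* request the compute state machine, starting at the Prep step */
	set_bit(STRIPE_OP_COMPUTE, &sh->state);
	set_bit(STRIPE_OP_COMPUTE_Prep, &sh->ops.state);
	sh->ops.pending++;
	queue_raid_work(sh);	/* takes a stripe ref, schedules sh->ops.work */
	spin_unlock(&sh->lock);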

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4500660..8fde62b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1362,6 +1362,295 @@ static int stripe_to_pdidx(sector_t stri
return pd_idx;
}

+/*
+ * raid5_do_soft_block_ops - perform block memory operations on stripe data
+ * outside the spin lock.
+ */
+static void raid5_do_soft_block_ops(void *stripe_head_ref)
+{
+ struct stripe_head *sh = stripe_head_ref;
+ int i, pd_idx = sh->pd_idx, disks = sh->disks;
+ void *ptr[MAX_XOR_BLOCKS];
+ int overlap=0, work=0, written=0, compute=0, dd_idx=0;
+ int pd_uptodate=0;
+ unsigned long state, ops_state, ops_state_orig;
+ raid5_conf_t *conf = sh->raid_conf;
+
+ /* take a snapshot of what needs to be done at this point in time */
+ spin_lock(&sh->lock);
+ state = sh->state;
+ ops_state_orig = ops_state = sh->ops.state;
+ spin_unlock(&sh->lock);
+
+ if (test_bit(STRIPE_OP_BIOFILL, &state)) {
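+ /* biofill: copy freshly read data from the stripe cache pages into
+ * the waiting bios and retire them
+ */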
+ struct bio *return_bi=NULL;
+
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_ReadReq, &dev->flags)) {
+ struct bio *rbi, *rbi2;
+ PRINTK("%s: stripe %llu STRIPE_OP_BIOFILL op_state: %lx disk: %d\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ ops_state, i);
+ spin_lock_irq(&conf->device_lock);
+ rbi = dev->toread;
+ dev->toread = NULL;
+ spin_unlock_irq(&conf->device_lock);
+ overlap++;
+ while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ copy_data(0, rbi, dev->page, dev->sector);
+ rbi2 = r5_next_bio(rbi, dev->sector);
+ spin_lock_irq(&conf->device_lock);
+ if (--rbi->bi_phys_segments == 0) {
+ rbi->bi_next = return_bi;
+ return_bi = rbi;
+ }
+ spin_unlock_irq(&conf->device_lock);
+ rbi = rbi2;
+ }
+ dev->read = return_bi;
+ }
+ }
+ if (overlap) {
+ set_bit(STRIPE_OP_BIOFILL_Done, &ops_state);
+ work++;
+ }
+ }
+
+ if (test_bit(STRIPE_OP_COMPUTE, &state)) {
+ dd_idx = -1;
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_ComputeReq, &dev->flags)) {
+ dd_idx = i;
+ break;
+ }
+ }
+ /* a compute request must name a target block */
+ BUG_ON(dd_idx < 0);
+ PRINTK("%s: stripe %llu STRIPE_OP_COMPUTE op_state: %lx block: %d\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ ops_state, dd_idx);
+ ptr[0] = page_address(sh->dev[dd_idx].page);
+
+ if (test_and_clear_bit(STRIPE_OP_COMPUTE_Prep, &ops_state)) {
+ memset(ptr[0], 0, STRIPE_SIZE);
+ set_bit(STRIPE_OP_COMPUTE_Parity, &ops_state);
+ }
+
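+ /* Prep zeroed the target, so the XOR pass below rebuilds the
+ * missing block directly into it from the other disks
+ */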
+ if (test_and_clear_bit(STRIPE_OP_COMPUTE_Parity, &ops_state)) {
+ int count = 1;
+ for (i = disks ; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ void *p;
+ if (i == dd_idx)
+ continue;
+ p = page_address(dev->page);
+ ptr[count++] = p;
+
+ check_xor();
+ }
+ if (count != 1)
+ xor_block(count, STRIPE_SIZE, ptr);
+
+ work++;
+ compute++;
+ set_bit(STRIPE_OP_COMPUTE_Done, &ops_state);
+ }
+ }
+
+ if (test_bit(STRIPE_OP_RMW, &state)) {
+ BUG_ON(test_bit(STRIPE_OP_RCW, &state));
+
+ PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ ops_state);
+
+ ptr[0] = page_address(sh->dev[pd_idx].page);
+
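+ /* r-m-w in three steps: XOR the blocks' old contents out of the
+ * parity, drain the new data into the cache, then XOR the new
+ * contents back into the parity
+ */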
+ if (test_and_clear_bit(STRIPE_OP_RMW_ParityPre, &ops_state)) {
+ int count = 1;
+
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ struct bio *chosen;
+
+ /* Only process blocks that are known to be uptodate */
+ if (dev->towrite && test_bit(R5_RMWReq, &dev->flags)) {
+ ptr[count++] = page_address(dev->page);
+
+ spin_lock(&sh->lock);
+ chosen = dev->towrite;
+ dev->towrite = NULL;
+ BUG_ON(dev->written);
+ dev->written = chosen;
+ spin_unlock(&sh->lock);
+
+ overlap++;
+
+ check_xor();
+ }
+ }
+ if (count != 1)
+ xor_block(count, STRIPE_SIZE, ptr);
+ set_bit(STRIPE_OP_RMW_Drain, &ops_state);
+ }
+ if (test_and_clear_bit(STRIPE_OP_RMW_Drain, &ops_state)) {
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ struct bio *wbi = dev->written;
+
+ if (dev->written)
+ written++;
+
+ while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ copy_data(1, wbi, dev->page, dev->sector);
+ wbi = r5_next_bio(wbi, dev->sector);
+ }
+ }
+ set_bit(STRIPE_OP_RMW_ParityPost, &ops_state);
+ }
+ if (test_and_clear_bit(STRIPE_OP_RMW_ParityPost, &ops_state)) {
+ int count = 1;
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (dev->written) {
+ ptr[count++] = page_address(dev->page);
+ check_xor();
+ }
+ }
+ if (count != 1)
+ xor_block(count, STRIPE_SIZE, ptr);
+
+ work++;
+ pd_uptodate++;
+ set_bit(STRIPE_OP_RMW_Done, &ops_state);
+ }
+
+ }
+
+ if (test_bit(STRIPE_OP_RCW, &state)) {
+ BUG_ON(test_bit(STRIPE_OP_RMW, &state));
+
+ PRINTK("%s: stripe %llu STRIPE_OP_RCW op_state: %lx\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ ops_state);
+
+ ptr[0] = page_address(sh->dev[pd_idx].page);
+
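+ /* reconstruct-write: drain all new data into the cache, zero the
+ * parity page, then XOR every data block into it
+ */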
+ if (test_and_clear_bit(STRIPE_OP_RCW_Drain, &ops_state)) {
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ struct bio *chosen;
+ struct bio *wbi;
+
+ if (i!=pd_idx && dev->towrite &&
+ test_bit(R5_LOCKED, &dev->flags)) {
+
+ spin_lock(&sh->lock);
+ chosen = dev->towrite;
+ dev->towrite = NULL;
+ spin_unlock(&sh->lock);
+
+ BUG_ON(dev->written);
+ wbi = dev->written = chosen;
+
+ overlap++;
+ written++;
+
+ while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ copy_data(1, wbi, dev->page, dev->sector);
+ wbi = r5_next_bio(wbi, dev->sector);
+ }
+ } else if (i==pd_idx)
+ memset(ptr[0], 0, STRIPE_SIZE);
+ }
+ set_bit(STRIPE_OP_RCW_Parity, &ops_state);
+ }
+ if (test_and_clear_bit(STRIPE_OP_RCW_Parity, &ops_state)) {
+ int count = 1;
+ for (i=disks; i--;)
+ if (i != pd_idx) {
+ ptr[count++] = page_address(sh->dev[i].page);
+ check_xor();
+ }
+ if (count != 1)
+ xor_block(count, STRIPE_SIZE, ptr);
+
+ work++;
+ pd_uptodate++;
+ set_bit(STRIPE_OP_RCW_Done, &ops_state);
+
+ }
+ }
+
+ if (test_bit(STRIPE_OP_CHECK, &state)) {
+ PRINTK("%s: stripe %llu STRIPE_OP_CHECK op_state: %lx\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ ops_state);
+
+ ptr[0] = page_address(sh->dev[pd_idx].page);
+
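+ /* check: XOR all data blocks into the in-cache parity; the result
+ * is zero iff the parity was consistent (this destroys the cached
+ * parity block, see the dependency notes in raid5.h)
+ */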
+ if (test_and_clear_bit(STRIPE_OP_CHECK_Gen, &ops_state)) {
+ int count = 1;
+ for (i=disks; i--;)
+ if (i != pd_idx) {
+ ptr[count++] = page_address(sh->dev[i].page);
+ check_xor();
+ }
+ if (count != 1)
+ xor_block(count, STRIPE_SIZE, ptr);
+
+ set_bit(STRIPE_OP_CHECK_Verify, &ops_state);
+ }
+ if (test_and_clear_bit(STRIPE_OP_CHECK_Verify, &ops_state)) {
+ if (page_is_zero(sh->dev[pd_idx].page))
+ set_bit(STRIPE_OP_CHECK_IsZero, &ops_state);
+
+ work++;
+ set_bit(STRIPE_OP_CHECK_Done, &ops_state);
+ }
+ }
+
+ spin_lock(&sh->lock);
+ /* Update the state of operations:
+ * -the XOR clears exactly the request bits captured in the snapshot
+ * (the completion mask keeps done status / check results intact)
+ * -the OR publishes our done/result bits and re-asserts any snapshot
+ * requests we did not service
+ * -requests added since 'ops_state_orig' was taken are untouched
+ */
+ sh->ops.state ^= (ops_state_orig & ~STRIPE_OP_COMPLETION_MASK);
+ sh->ops.state |= ops_state;
+
+ if (pd_uptodate)
+ set_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+
+ if (written)
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (dev->written)
+ set_bit(R5_UPTODATE, &dev->flags);
+ }
+
+ if (overlap)
+ for (i= disks; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_and_clear_bit(R5_Overlap, &dev->flags))
+ wake_up(&sh->raid_conf->wait_for_overlap);
+ }
+
+ if (compute) {
+ clear_bit(R5_ComputeReq, &sh->dev[dd_idx].flags);
+ set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+ }
+
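+ /* each completed operation above contributed one unit of 'work';
+ * anything still pending is re-queued by queue_raid_work() below
+ */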
+ sh->ops.pending -= work;
+ BUG_ON(sh->ops.pending < 0);
+ clear_bit(STRIPE_OP_QUEUED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ queue_raid_work(sh);
+ spin_unlock(&sh->lock);
+
+ release_stripe(sh);
+}

/*
* handle_stripe - do things to a stripe.
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 20ed4c9..c8a315b 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -116,13 +116,39 @@ #include <linux/raid/xor.h>
* attach a request to an active stripe (add_stripe_bh())
* lockdev attach-buffer unlockdev
* handle a stripe (handle_stripe())
- * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
+ * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io/ops needed unlockstripe schedule io/ops
* release an active stripe (release_stripe())
* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
*
* The refcount counts each thread that have activated the stripe,
* plus raid5d if it is handling it, plus one for each active request
- * on a cached buffer.
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * Stripe operations are performed outside the stripe lock;
+ * the operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and read-modify-write)
+ * -checking parity correctness
+ * These operations are carried out by either a software routine,
+ * raid5_do_soft_block_ops, or by a routine that arranges for the work to be
+ * done by dedicated DMA engines.
+ * When requesting an operation, handle_stripe sets the proper state and
+ * work request flags and then hands control to the operations routine.
+ * There are critical dependencies between the operations that prevent
+ * some of them from being requested while another is in flight.
+ * Here are the inter-dependencies:
+ * -parity check operations destroy the in-cache version of the parity
+ * block, so we prevent parity-dependent operations like writes and
+ * compute_blocks from starting while a check is in progress.
+ * -when a write operation is requested we immediately lock the affected
+ * blocks and mark them as not up to date. This holds off new read
+ * requests, as well as parity checks and compute block operations.
+ * -once a compute block operation has been requested handle_stripe treats
+ * that block as immediately up to date. The routine carrying out the
+ * operation guarantees that any operation dependent on the compute
+ * block result is initiated only after the computation completes.
*/

struct stripe_head {
@@ -136,11 +162,18 @@ struct stripe_head {
spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */
+ struct stripe_operations {
+ int pending; /* number of operations requested */
+ unsigned long state; /* state of block operations */
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ struct work_struct work; /* work queue descriptor */
+ #endif
+ } ops;
struct r5dev {
struct bio req;
struct bio_vec vec;
struct page *page;
- struct bio *toread, *towrite, *written;
+ struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
} dev[1]; /* allocated with extra space depending of RAID geometry */
@@ -158,6 +191,11 @@ #define R5_ReadError 8 /* seen a read er
#define R5_ReWrite 9 /* have tried to over-write the readerror */

#define R5_Expanded 10 /* This block now has post-expand data */
+#define R5_Consistent 11 /* Block is HW DMA-able without a cache flush */
+#define R5_ComputeReq 12 /* compute_block in progress treat as uptodate */
+#define R5_ReadReq 13 /* dev->toread contains a bio that needs filling */
+#define R5_RMWReq 14 /* distinguish blocks ready for rmw from other "towrites" */
+
/*
* Write method
*/
@@ -179,6 +217,72 @@ #define STRIPE_BIT_DELAY 8
#define STRIPE_EXPANDING 9
#define STRIPE_EXPAND_SOURCE 10
#define STRIPE_EXPAND_READY 11
+#define STRIPE_OP_RCW 12
+#define STRIPE_OP_RMW 13 /* RAID-5 only */
+#define STRIPE_OP_UPDATE 14 /* RAID-6 only */
+#define STRIPE_OP_CHECK 15
+#define STRIPE_OP_COMPUTE 16
+#define STRIPE_OP_COMPUTE2 17 /* RAID-6 only */
+#define STRIPE_OP_BIOFILL 18
+#define STRIPE_OP_QUEUED 19
+#define STRIPE_OP_DMA 20
+
+/*
+ * These flags are communication markers between the handle_stripe[5|6]
+ * routine and the block operations work queue
+ * - The *_Done definitions signal completion from work queue to handle_stripe
+ * - STRIPE_OP_CHECK_IsZero: signals parity correctness to handle_stripe
+ * - STRIPE_OP_RCW_Expand: expansion operations perform a modified RCW sequence
+ * - STRIPE_OP_COMPUTE_Recover_pd: recovering the parity disk involves an extra
+ * write back step
+ * - STRIPE_OP_*_Dma: flag operations that will be done once the DMA engine
+ * goes idle
+ * - All other definitions are service requests for the work queue
+ */
+#define STRIPE_OP_RCW_Drain 0
+#define STRIPE_OP_RCW_Parity 1
+#define STRIPE_OP_RCW_Done 2
+#define STRIPE_OP_RCW_Expand 3
+#define STRIPE_OP_RMW_ParityPre 4
+#define STRIPE_OP_RMW_Drain 5
+#define STRIPE_OP_RMW_ParityPost 6
+#define STRIPE_OP_RMW_Done 7
+#define STRIPE_OP_CHECK_Gen 8
+#define STRIPE_OP_CHECK_Verify 9
+#define STRIPE_OP_CHECK_Done 10
+#define STRIPE_OP_CHECK_IsZero 11
+#define STRIPE_OP_COMPUTE_Prep 12
+#define STRIPE_OP_COMPUTE_Parity 13
+#define STRIPE_OP_COMPUTE_Done 14
+#define STRIPE_OP_COMPUTE_Recover_pd 15
+#define STRIPE_OP_BIOFILL_Copy 16
+#define STRIPE_OP_BIOFILL_Done 17
+#define STRIPE_OP_RCW_Dma 18
+#define STRIPE_OP_RMW_Dma 19
+#define STRIPE_OP_UPDATE_Dma 20
+#define STRIPE_OP_CHECK_Dma 21
+#define STRIPE_OP_COMPUTE_Dma 22
+#define STRIPE_OP_COMPUTE2_Dma 23
+#define STRIPE_OP_BIOFILL_Dma 24
+
+/*
+ * Bit mask for status bits not to be auto-cleared by the work queue thread
+ */
+#define STRIPE_OP_COMPLETION_MASK (1 << STRIPE_OP_RCW_Done |\
+ 1 << STRIPE_OP_RMW_Done |\
+ 1 << STRIPE_OP_CHECK_Done |\
+ 1 << STRIPE_OP_CHECK_IsZero |\
+ 1 << STRIPE_OP_COMPUTE_Done |\
+ 1 << STRIPE_OP_COMPUTE_Recover_pd |\
+ 1 << STRIPE_OP_BIOFILL_Done |\
+ 1 << STRIPE_OP_RCW_Dma |\
+ 1 << STRIPE_OP_RMW_Dma |\
+ 1 << STRIPE_OP_UPDATE_Dma |\
+ 1 << STRIPE_OP_CHECK_Dma |\
+ 1 << STRIPE_OP_COMPUTE_Dma |\
+ 1 << STRIPE_OP_COMPUTE2_Dma |\
+ 1 << STRIPE_OP_BIOFILL_Dma)
+
/*
* Plugging:
*
@@ -229,11 +333,19 @@ struct raid5_private_data {
atomic_t preread_active_stripes; /* stripes with scheduled io */

atomic_t reshape_stripes; /* stripes with pending writes for reshape */
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ struct workqueue_struct *block_ops_queue;
+ #endif
+ void (*do_block_ops)(void *);
+
/* unfortunately we need two cache names as we temporarily have
* two caches.
*/
int active_name;
char cache_name[2][20];
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ char workqueue_name[20];
+ #endif
kmem_cache_t *slab_cache; /* for allocating stripes */

int seq_flush, seq_write;
@@ -264,6 +376,17 @@ struct raid5_private_data {
typedef struct raid5_private_data raid5_conf_t;

#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
+/* must be called under the stripe lock */
+static inline void queue_raid_work(struct stripe_head *sh)
+{
+ if (sh->ops.pending != 0 && !test_bit(STRIPE_OP_QUEUED, &sh->state)) {
+ set_bit(STRIPE_OP_QUEUED, &sh->state);
+ atomic_inc(&sh->count);
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work);
+ #endif
+ }
+}

/*
* Our supported algorithms
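
A note on the reconciliation at the tail of raid5_do_soft_block_ops: the
worker snapshots the request bits under the lock, services them against a
local copy, and the final XOR/OR pair folds the results back without
losing requests that arrived in the meantime. A self-contained toy model
of that merge (hypothetical bit names, not from the patch):

	#include <assert.h>

	#define OP_REQ_A	(1UL << 0) /* serviced during this pass */
	#define OP_REQ_B	(1UL << 1) /* arrives while the worker runs */
	#define OP_A_Done	(1UL << 2) /* completion bit, kept by the mask */
	#define COMPLETION_MASK	OP_A_Done

	int main(void)
	{
		unsigned long state, snap_orig, snap;

		state = OP_REQ_A;         /* handle_stripe requests op A */
		snap_orig = snap = state; /* worker snapshots under sh->lock */

		snap &= ~OP_REQ_A;        /* worker services A ... */
		snap |= OP_A_Done;        /* ... recording completion locally */

		state |= OP_REQ_B;        /* op B requested while worker ran */

		/* the merge from the tail of raid5_do_soft_block_ops */
		state ^= (snap_orig & ~COMPLETION_MASK);
		state |= snap;

		/* serviced request cleared, new request and done bit kept */
		assert(state == (OP_REQ_B | OP_A_Done));
		return 0;
	}

The XOR clears exactly the snapshot's request bits; completion bits are
excluded by the mask, so a done/result flag is never toggled off, and a
snapshot request that was not serviced survives via the final OR.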