[PATCH 002 of 006] raid5: Move check parity operations to a workqueue

From: Dan Williams
Date: Wed Jun 28 2006 - 14:22:36 EST


This patch adds 'check parity' capabilities to the work queue and fixes
'queue_raid_work': the fragile 'queue_count' counter is replaced by a
'pending' count plus a STRIPE_OP_QUEUED flag, so a stripe is queued at
most once regardless of how many operations are outstanding.
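
The intended flow of a check operation through the new bits, end to end,
looks roughly like this (an illustrative sketch assembled from the pieces
of the diff below, not literal code from any one function):

    /* handle_stripe5, under sh->lock: request stage 1 */
    set_bit(STRIPE_OP_CHECK, &sh->state);
    set_bit(STRIPE_OP_CHECK_Gen, &sh->ops.state);
    sh->ops.pending++;
    queue_raid_work(sh);

    /* raid5_do_soft_block_ops: xor all blocks into the parity
     * buffer, then test the result for zero
     */
    if (test_and_clear_bit(STRIPE_OP_CHECK_Gen, &ops_state))
        set_bit(STRIPE_OP_CHECK_Verify, &ops_state);
    if (test_and_clear_bit(STRIPE_OP_CHECK_Verify, &ops_state)) {
        if (page_is_zero(sh->dev[pd_idx].page))
            set_bit(STRIPE_OP_CHECK_IsZero, &ops_state);
        set_bit(STRIPE_OP_CHECK_Done, &ops_state);
    }

    /* handle_stripe5, next pass: consume the result */
    if (test_bit(STRIPE_OP_CHECK_Done, &sh->ops.state) &&
        test_and_clear_bit(STRIPE_OP_CHECK_IsZero, &sh->ops.state))
            set_bit(STRIPE_INSYNC, &sh->state);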

Also, raid5_do_soft_block_ops now takes its snapshot of the stripe
operations state under sh->lock, and merges its results back under the
lock as well, to ensure that it never gets out of sync with
handle_stripe5.
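
Requests that arrive while the work queue is running against its private
snapshot are preserved by the merge. In outline (a sketch of the pattern,
with the block I/O details elided):

    spin_lock(&sh->lock);
    ops_state_orig = ops_state = sh->ops.state;
    spin_unlock(&sh->lock);

    /* ... do the xor_block() work against the snapshot, clearing
     * request bits and setting *_Done / result bits in the local
     * ops_state copy ...
     */

    spin_lock(&sh->lock);
    /* drop only the requests we consumed ... */
    sh->ops.state ^= (ops_state_orig & ~STRIPE_OP_COMPLETION_MASK);
    /* ... then merge in completion/result bits plus anything that
     * was requested in the meantime
     */
    sh->ops.state |= ops_state;
    spin_unlock(&sh->lock);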

Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>

drivers/md/raid5.c | 123 ++++++++++++++++++++++++++++++++++-----------
include/linux/raid/raid5.h | 25 ++++++---
2 files changed, 113 insertions(+), 35 deletions(-)

Index: linux-2.6-raid/drivers/md/raid5.c
===================================================================
--- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 09:52:07.000000000 -0700
+++ linux-2.6-raid/drivers/md/raid5.c 2006-06-28 10:35:23.000000000 -0700
@@ -1289,7 +1289,7 @@
if (locked > 0) {
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
- sh->ops.queue_count++;
+ sh->ops.pending++;
} else if (locked == 0)
set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);

@@ -1300,6 +1300,37 @@
return locked;
}

+static int handle_check_operations5(struct stripe_head *sh, int start_n)
+{
+ int complete=0, work_queued = -EBUSY;
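+ /* work_queued: 1 == new check queued, 0 == a completed check was
+ * acknowledged, -EBUSY == a previously queued check is still in
+ * flight
+ */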
+
+ if (test_bit(STRIPE_OP_CHECK, &sh->state) &&
+ test_bit(STRIPE_OP_CHECK_Done, &sh->ops.state)) {
+ clear_bit(STRIPE_OP_CHECK, &sh->state);
+ clear_bit(STRIPE_OP_CHECK_Done, &sh->ops.state);
+ complete = 1;
+ }
+
+ if (start_n) {
+ /* enter stage 1 of parity check operation */
+ set_bit(STRIPE_OP_CHECK, &sh->state);
+ set_bit(STRIPE_OP_CHECK_Gen, &sh->ops.state);
+ work_queued = 1;
+ } else if (complete)
+ work_queued = 0;
+
+ if (work_queued > 0) {
+ clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+ sh->ops.pending++;
+ }
+
+ PRINTK("%s: stripe %llu start: %d complete: %d op_state: %lx\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ start_n, complete, sh->ops.state);
+
+ return work_queued;
+}
+

/*
* Each stripe/dev can have one or more bion attached.
@@ -1406,11 +1437,11 @@
/* must be called under the stripe lock */
static void queue_raid_work(struct stripe_head *sh)
{
- if (--sh->ops.queue_count == 0) {
+ if (!test_bit(STRIPE_OP_QUEUED, &sh->state) && sh->ops.pending) {
+ set_bit(STRIPE_OP_QUEUED, &sh->state);
atomic_inc(&sh->count);
queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work);
- } else if (sh->ops.queue_count < 0)
- sh->ops.queue_count = 0;
+ }
}

/*
@@ -1423,16 +1454,17 @@
int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1;
void *ptr[MAX_XOR_BLOCKS];
struct bio *chosen;
- int overlap=0, new_work=0, written=0;
- unsigned long state, ops_state;
+ int overlap=0, work=0, written=0;
+ unsigned long state, ops_state, ops_state_orig;

/* take a snapshot of what needs to be done at this point in time */
spin_lock(&sh->lock);
state = sh->state;
- ops_state = sh->ops.state;
+ ops_state_orig = ops_state = sh->ops.state;
spin_unlock(&sh->lock);

if (test_bit(STRIPE_OP_RMW, &state)) {
+ BUG_ON(test_bit(STRIPE_OP_RCW, &state));
PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n",
__FUNCTION__, (unsigned long long)sh->sector,
ops_state);
@@ -1483,14 +1515,14 @@
if (count != 1)
xor_block(count, STRIPE_SIZE, ptr);

- /* signal completion and acknowledge the last state seen
- * by sh->ops.state
- */
+ work++;
set_bit(STRIPE_OP_RMW_Done, &ops_state);
- set_bit(STRIPE_OP_RMW_ParityPre, &ops_state);
}

- } else if (test_bit(STRIPE_OP_RCW, &state)) {
+ }
+
+ if (test_bit(STRIPE_OP_RCW, &state)) {
+ BUG_ON(test_bit(STRIPE_OP_RMW, &state));
PRINTK("%s: stripe %llu STRIPE_OP_RCW op_state: %lx\n",
__FUNCTION__, (unsigned long long)sh->sector,
ops_state);
@@ -1527,20 +1559,47 @@
if (count != 1)
xor_block(count, STRIPE_SIZE, ptr);

- /* signal completion and acknowledge the last state seen
- * by sh->ops.state
- */
+ work++;
set_bit(STRIPE_OP_RCW_Done, &ops_state);
- set_bit(STRIPE_OP_RCW_Drain, &ops_state);

}
}

+ if (test_bit(STRIPE_OP_CHECK, &state)) {
+ PRINTK("%s: stripe %llu STRIPE_OP_CHECK op_state: %lx\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ ops_state);
+
+ ptr[0] = page_address(sh->dev[pd_idx].page);
+
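+ /* stage 1: xor all data blocks into the parity buffer */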
+ if (test_and_clear_bit(STRIPE_OP_CHECK_Gen, &ops_state)) {
+ for (i=disks; i--;)
+ if (i != pd_idx) {
+ ptr[count++] = page_address(sh->dev[i].page);
+ check_xor();
+ }
+ if (count != 1)
+ xor_block(count, STRIPE_SIZE, ptr);
+
+ set_bit(STRIPE_OP_CHECK_Verify, &ops_state);
+ }
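+ /* stage 2: a zero xor result means the on-disk parity was correct */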
+ if (test_and_clear_bit(STRIPE_OP_CHECK_Verify, &ops_state)) {
+ if (page_is_zero(sh->dev[pd_idx].page))
+ set_bit(STRIPE_OP_CHECK_IsZero, &ops_state);
+
+ work++;
+ set_bit(STRIPE_OP_CHECK_Done, &ops_state);
+ }
+ }
+
spin_lock(&sh->lock);
- /* Update the state of operations, by XORing we clear the stage 1 requests
- * while preserving new requests.
+ /* Update the state of operations:
+ * -clear incoming requests
+ * -preserve output status (i.e. done status / check result)
+ * -preserve requests added since 'ops_state_orig' was set
*/
- sh->ops.state ^= ops_state;
+ sh->ops.state ^= (ops_state_orig & ~STRIPE_OP_COMPLETION_MASK);
+ sh->ops.state |= ops_state;

if (written)
for (i=disks ; i-- ;) {
@@ -1556,7 +1615,8 @@
wake_up(&sh->raid_conf->wait_for_overlap);
}

- sh->ops.queue_count += new_work;
+ sh->ops.pending -= work;
+ clear_bit(STRIPE_OP_QUEUED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
queue_raid_work(sh);
spin_unlock(&sh->lock);
@@ -1941,17 +2001,24 @@
* Any reads will already have been scheduled, so we just see if enough data
* is available
*/
- if (syncing && locked == 0 &&
- !test_bit(STRIPE_INSYNC, &sh->state)) {
+ if ((syncing && locked == 0 &&
+ !test_bit(STRIPE_INSYNC, &sh->state)) ||
+ test_bit(STRIPE_OP_CHECK, &sh->state)) {
+ int work_queued = 0, result = 0;
+
set_bit(STRIPE_HANDLE, &sh->state);
if (failed == 0) {
- BUG_ON(uptodate != disks);
- compute_parity5(sh, CHECK_PARITY);
- uptodate--;
- if (page_is_zero(sh->dev[sh->pd_idx].page)) {
+ BUG_ON(!test_bit(STRIPE_OP_CHECK, &sh->state) &&
+ (uptodate != disks));
+ work_queued = handle_check_operations5(sh,
+ uptodate == disks);
+ result = test_and_clear_bit(STRIPE_OP_CHECK_IsZero, &sh->ops.state);
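+ /* 'result' is only meaningful when a check has just completed,
+ * i.e. when work_queued == 0
+ */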
+ if (work_queued > 0) {
+ uptodate--;
+ } else if (result && work_queued == 0) {
/* parity is correct (on disc, not in buffer any more) */
set_bit(STRIPE_INSYNC, &sh->state);
- } else {
+ } else if (!result && work_queued == 0) {
conf->mddev->resync_mismatches += STRIPE_SECTORS;
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
@@ -1962,7 +2029,7 @@
}
}
}
- if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+ if (!test_bit(STRIPE_INSYNC, &sh->state) && work_queued == 0) {
/* either failed parity check, or recovery is happening */
if (failed==0)
failed_num = sh->pd_idx;
Index: linux-2.6-raid/include/linux/raid/raid5.h
===================================================================
--- linux-2.6-raid.orig/include/linux/raid/raid5.h 2006-06-28 10:34:54.000000000 -0700
+++ linux-2.6-raid/include/linux/raid/raid5.h 2006-06-28 10:35:23.000000000 -0700
@@ -147,7 +147,7 @@
int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */
struct stripe_operations {
- int queue_count; /* if == 0 places stripe in the workqueue */
+ int pending; /* number of operations requested */
unsigned long state; /* state of block operations */
struct work_struct work; /* work queue descriptor */
#ifdef CONFIG_DMA_ENGINE
@@ -208,14 +208,16 @@
#define STRIPE_OP_COMPUTE 16
#define STRIPE_OP_COMPUTE2 17 /* RAID-6 only */
#define STRIPE_OP_BIOFILL 18
+#define STRIPE_OP_QUEUED 19

/*
* These flags are communication markers between the handle_stripe[5|6]
* routine and the block operations work queue
- * - The _End definitions are a signal from handle_stripe to the work queue to
+ * - The *_End definitions are a signal from handle_stripe to the work queue
* to ensure the completion of the operation so the results can be committed
* to disk
- * - The _Done definitions signal completion from work queue to handle_stripe
+ * - The *_Done definitions signal completion from work queue to handle_stripe
+ * - STRIPE_OP_CHECK_IsZero signals parity correctness to handle_stripe
* - All other definitions are service requests for the work queue
*/
#define STRIPE_OP_RCW_Drain 0
@@ -231,12 +233,21 @@
#define STRIPE_OP_CHECK_Verify 10
#define STRIPE_OP_CHECK_End 11
#define STRIPE_OP_CHECK_Done 12
-#define STRIPE_OP_COMPUTE_Prep 13
-#define STRIPE_OP_COMPUTE_Parity 14
-#define STRIPE_OP_COMPUTE_End 15
-#define STRIPE_OP_COMPUTE_Done 16
+#define STRIPE_OP_CHECK_IsZero 13
+#define STRIPE_OP_COMPUTE_Prep 14
+#define STRIPE_OP_COMPUTE_Parity 15
+#define STRIPE_OP_COMPUTE_End 16
+#define STRIPE_OP_COMPUTE_Done 17

/*
+ * Bit mask for status bits set by the work queue thread
+ */
+#define STRIPE_OP_COMPLETION_MASK (1 << STRIPE_OP_RCW_Done |\
+ 1 << STRIPE_OP_RMW_Done |\
+ 1 << STRIPE_OP_CHECK_Done |\
+ 1 << STRIPE_OP_CHECK_IsZero |\
+ 1 << STRIPE_OP_COMPUTE_Done)
+/*
* Plugging:
*
* To improve write throughput, we need to delay the handling of some