Re: Patch for block write clustering

Emil Briggs (briggs@bucky.physics.ncsu.edu)
Wed, 4 Mar 1998 10:51:54 -0500 (EST)


>> This is a patch to implement clustered writes in bdflush. It sorts dirty
>> blocks by device and block number so that the writes are done sequentially --
>> the idea being to reduce seek times for the disk heads.
>
>I've read the patch, and it looks like it implements the sorting
>in <linux/blk.h> on a higher level (where it really should belong),
>but it still suffers from the same bug:
>

Yes, I started looking through the lower-level code. I am puzzled though
as to why the low-level sorting doesn't seem to have much effect in 2.0.33
at least. Using a stock 2.0.33 kernel with write-caching turned off on the
drive itself causes a huge performance hit. (I can post the benchmark program
I used if anyone wants to try it themselves)

>
>This algoritm has the distict disadvantage of flushing
>hda before hdb, and hdc (before hdd) even later.

Thats a good point and I guess you could run into problems under a heavy
io load to multiple disks.

>> Obviously if you have a drive that does write caching you don't
>> want this patch.
>
>Yes you do, when you have loads of partitions, loads of
>I/O and use the sorting scheme I sketched above, you
>will notice a bigger advantage. I can hardly wait to
>see your sorting (with the 'new' scheme) in 2.1.89
>bdflush...
>

I'll put it in -- the patch needs quite a bit of working. I don't have a system
with more than one drive on it that I can use for testing this sort of stuff though.

>> The patch assumes that the bdflush parameters are set to their default
>> values and things might break if thats not the case, that needs to be
>> fixed.
>
>I haven't applied it yet (only read it) to 2.1.89pre-5
>(it looks somewhat valid for 2.1 too), but it looks
>like you ran patch in reverse...

Yes I did. Sorry about that -- the correct patch follows.

Emil

--- linux/fs/buffer.c.orig Tue Mar 3 20:06:17 1998
+++ linux/fs/buffer.c Tue Mar 3 21:21:02 1998
@@ -47,6 +47,8 @@
#define HASH_MASK (NR_HASH-1)

static int grow_buffers(int pri, int size);
+static int shellsort(struct buffer_head *bh, struct buffer_head **bhptrs, int maxbh);
+

static struct buffer_head ** hash_table;
static struct buffer_head * lru_list[NR_LIST] = {NULL, };
@@ -1645,12 +1647,24 @@
* 3) Quit writing loop blocks if a freelist went low (avoids deadlock
* with running out of free buffers for loop's "real" device).
*/
+
+/* Write sorting added 02-28-98 - Emil Briggs
+
+ Need to make the sorting size dynamically adjustable.
+
+*/
+
+#define IO_CLUSTER_SIZE 500
+struct buffer_head *bhptrs[IO_CLUSTER_SIZE];
+
+
int bdflush(void * unused)
{
int i;
+ int idx;
int ndirty;
int nlist;
- int ncount;
+ int ncount, sortcount;
struct buffer_head * bh, *next;
int major;
int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
@@ -1692,53 +1706,85 @@
ndirty = 0;
refilled = 0;
repeat:
- allow_interrupts();

- bh = lru_list[nlist];
- if(bh)
- for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
- bh = next) {
- /* We may have stalled while waiting for I/O to complete. */
- if(bh->b_list != nlist) goto repeat;
- next = bh->b_next_free;
- if(!lru_list[nlist]) {
- printk("Dirty list empty %d\n", i);
- break;
- }
+ bh = lru_list[nlist];
+
+ /* I/O clustering implemented to reduce seeks. We use another level of indirection
+ and create a sorted list of buffers here. Blocks are sorted by device and blocknr */
+
+ sortcount = shellsort(bh, bhptrs, IO_CLUSTER_SIZE);
+ bh = bhptrs[0];
+ idx = 0;
+
+ for (i = 0;i < nr_buffers_type[nlist]; i++) {
+
+
+ if( idx >= sortcount ) goto repeat;
+
+ if( !bh || (ndirty >= bdf_prm.b_un.ndirty) ) break;
+
+
+ bh = bhptrs[idx];
+ idx++;
+
+
+ /* We may have stalled while waiting for I/O to complete. */
+ if(bh->b_list != nlist) goto repeat;
+ next = bh->b_next_free;
+ if(!lru_list[nlist]) {
+
+ printk("Dirty list empty %d\n", i);
+ break;
+ }

- /* Clean buffer on dirty list? Refile it */
- if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
- {
- refile_buffer(bh);
- continue;
- }
+
+ /* Clean buffer on dirty list? Refile it */
+ if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
+
+ refile_buffer(bh);
+ continue;
+
+ }

- if (buffer_locked(bh) || !buffer_dirty(bh))
- continue;
- major = MAJOR(bh->b_dev);
- /* Should we write back buffers that are shared or not??
- currently dirty buffers are not shared, so it does not matter */
- if (refilled && major == LOOP_MAJOR)
- continue;
- next->b_count++;
- bh->b_count++;
- ndirty++;
- bh->b_flushtime = 0;
- if (major == LOOP_MAJOR) {
- ll_rw_block(wrta_cmd,1, &bh);
- wrta_cmd = WRITEA;
- if (buffer_dirty(bh))
- --ndirty;
- }
- else
- ll_rw_block(WRITE, 1, &bh);
+
+ if (buffer_locked(bh) || !buffer_dirty(bh))
+ continue;
+
+ major = MAJOR(bh->b_dev);
+ /* Should we write back buffers that are shared or not??
+ currently dirty buffers are not shared, so it does not matter */
+ if (refilled && major == LOOP_MAJOR)
+ continue;
+
+ next->b_count++;
+ bh->b_count++;
+ ndirty++;
+ bh->b_flushtime = 0;
+
+ if (major == LOOP_MAJOR) {
+
+ ll_rw_block(wrta_cmd,1, &bh);
+ wrta_cmd = WRITEA;
+ if (buffer_dirty(bh))
+ --ndirty;
+ }
+ else {
+
+ ll_rw_block(WRITE, 1, &bh);
+
+ }
+
#ifdef DEBUG
- if(nlist != BUF_DIRTY) ncount++;
+ if(nlist != BUF_DIRTY) ncount++;
#endif
- bh->b_count--;
- next->b_count--;
- }
- }
+ bh->b_count--;
+ next->b_count--;
+
+
+ } /* end for */
+
+
+ } /* end for */
#ifdef DEBUG
if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
printk("sleeping again.\n");
@@ -1764,19 +1810,80 @@
}


-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only. This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 8
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -8
- * c-argdecl-indent: 8
- * c-label-offset: -8
- * c-continued-statement-offset: 8
- * c-continued-brace-offset: 0
- * End:
- */
+static int shellsort(struct buffer_head *bh, struct buffer_head **bhptrs, int maxbh)
+
+{
+
+ int count = 0;
+ int split;
+ int size;
+ int incr;
+ int counter;
+ int offset;
+ struct buffer_head *tptr;
+
+
+ /* Copy buffer_head pointers into bhptrs */
+ while(bh && (count < maxbh) ) {
+
+ bhptrs[count] = bh;
+ bh = bh->b_next_free;
+ count++;
+
+ } /* end while */
+
+
+ if(count <= 1) return count;
+
+ /* Now sort the list */
+ split = count / 2;
+
+
+ incr = 1;
+ size = count - split;
+ while(1) {
+
+ counter = incr - 1;
+ while(1) {
+
+ offset = counter + split;
+ if( (kdev_t_to_nr(bhptrs[counter]->b_dev) >
+ kdev_t_to_nr(bhptrs[offset]->b_dev)) ||
+ ((kdev_t_to_nr(bhptrs[counter]->b_dev) ==
+ kdev_t_to_nr(bhptrs[offset]->b_dev)) &&
+ (bhptrs[counter]->b_blocknr > bhptrs[offset]->b_blocknr))) {
+
+ tptr = bhptrs[counter];
+ bhptrs[counter] = bhptrs[offset];
+ bhptrs[offset] = tptr;
+ counter -= split;
+ if(counter < 0) break;
+
+ }
+ else {
+
+ break;
+
+ } /* end if */
+
+ } /* end while */
+
+
+ incr++;
+ if(incr > size) {
+
+ split /= 2;
+ if( !split ) break;
+ incr = 1;
+ size = count - split;
+
+ } /* end while */
+
+ } /* end while */
+
+
+ return count;
+
+} /* end shellsort */
+
+

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu