Patch for block write clustering

Emil Briggs (briggs@bucky.physics.ncsu.edu)
Tue, 3 Mar 1998 22:18:04 -0500 (EST)


This is a patch to implement clustered writes in bdflush. It sorts dirty
blocks by device and block number so that the writes are done sequentially --
the idea being to reduce seek times for the disk heads.

With an IDE drive I saw a 600% performance improvement on one (admittedly
contrived benchmark). I'm using an IDE rather than my SCSI disk since I
don't have any way to control the write caching on my SCSI drive.

Here are some test results on 2.0.33. For testing I wrote a benchmark
that interleaved writes to two files which were large enough so the
buffers would have to be flushed repeatedly before the test finished.
I did runs with/without the patch and with/without write caching enabled
on the IDE drive.

Drive parameters | Stock 2.0.33 Patched 2.0.33
------------------|-----------------------------------------
hdparm -W 0 -K 0 | 1:30.10elapsed 0:21.77elapsed
hdparm -W 1 -K 0 | 0:14.51elapsed 0:15.44elapsed

Obviously if you have a drive that does write caching you don't
want this patch. Older IDE drives which don't do write caching
could benefit however. I also tried this using a 2.1.88 kernel
but I'm not sure what is going on there. The numbers were all the
same. (Around 15 seconds). Either the low level drivers have been
rewritten to sort requests (I still have to go through that code
and learn how it works) or I was unable to turn write caching off
under 2.1.88. This might be a possibility since I got a kernel
message saying there was a problem with the I/O operation when
I ran hdparm.

The patch assumes that the bdflush parameters are set to their default
values and things might break if thats not the case, that needs to be
fixed.

Anyway here it is -- comments or suggestions are appreciated, I know this
may not be a terribly useful addition but it was fun and I learned quite
a bit from it.

Thanks
Emil Briggs

--- linux/fs/buffer.c Tue Mar 3 21:21:02 1998
+++ linux/fs/buffer.c.orig Tue Mar 3 20:06:17 1998
@@ -47,8 +47,6 @@
#define HASH_MASK (NR_HASH-1)

static int grow_buffers(int pri, int size);
-static int shellsort(struct buffer_head *bh, struct buffer_head **bhptrs, int maxbh);
-

static struct buffer_head ** hash_table;
static struct buffer_head * lru_list[NR_LIST] = {NULL, };
@@ -1647,24 +1645,12 @@
* 3) Quit writing loop blocks if a freelist went low (avoids deadlock
* with running out of free buffers for loop's "real" device).
*/
-
-/* Write sorting added 02-28-98 - Emil Briggs
-
- Need to make the sorting size dynamically adjustable.
-
-*/
-
-#define IO_CLUSTER_SIZE 500
-struct buffer_head *bhptrs[IO_CLUSTER_SIZE];
-
-
int bdflush(void * unused)
{
int i;
- int idx;
int ndirty;
int nlist;
- int ncount, sortcount;
+ int ncount;
struct buffer_head * bh, *next;
int major;
int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
@@ -1706,85 +1692,53 @@
ndirty = 0;
refilled = 0;
repeat:
+ allow_interrupts();

- bh = lru_list[nlist];
-
- /* I/O clustering implemented to reduce seeks. We use another level of indirection
- and create a sorted list of buffers here. Blocks are sorted by device and blocknr */
-
- sortcount = shellsort(bh, bhptrs, IO_CLUSTER_SIZE);
- bh = bhptrs[0];
- idx = 0;
-
- for (i = 0;i < nr_buffers_type[nlist]; i++) {
-
-
- if( idx >= sortcount ) goto repeat;
-
- if( !bh || (ndirty >= bdf_prm.b_un.ndirty) ) break;
-
-
- bh = bhptrs[idx];
- idx++;
-
-
- /* We may have stalled while waiting for I/O to complete. */
- if(bh->b_list != nlist) goto repeat;
- next = bh->b_next_free;
- if(!lru_list[nlist]) {
-
- printk("Dirty list empty %d\n", i);
- break;
- }
+ bh = lru_list[nlist];
+ if(bh)
+ for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
+ bh = next) {
+ /* We may have stalled while waiting for I/O to complete. */
+ if(bh->b_list != nlist) goto repeat;
+ next = bh->b_next_free;
+ if(!lru_list[nlist]) {
+ printk("Dirty list empty %d\n", i);
+ break;
+ }

-
- /* Clean buffer on dirty list? Refile it */
- if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
-
- refile_buffer(bh);
- continue;
-
- }
+ /* Clean buffer on dirty list? Refile it */
+ if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
+ {
+ refile_buffer(bh);
+ continue;
+ }

-
- if (buffer_locked(bh) || !buffer_dirty(bh))
- continue;
-
- major = MAJOR(bh->b_dev);
- /* Should we write back buffers that are shared or not??
- currently dirty buffers are not shared, so it does not matter */
- if (refilled && major == LOOP_MAJOR)
- continue;
-
- next->b_count++;
- bh->b_count++;
- ndirty++;
- bh->b_flushtime = 0;
-
- if (major == LOOP_MAJOR) {
-
- ll_rw_block(wrta_cmd,1, &bh);
- wrta_cmd = WRITEA;
- if (buffer_dirty(bh))
- --ndirty;
- }
- else {
-
- ll_rw_block(WRITE, 1, &bh);
-
- }
-
+ if (buffer_locked(bh) || !buffer_dirty(bh))
+ continue;
+ major = MAJOR(bh->b_dev);
+ /* Should we write back buffers that are shared or not??
+ currently dirty buffers are not shared, so it does not matter */
+ if (refilled && major == LOOP_MAJOR)
+ continue;
+ next->b_count++;
+ bh->b_count++;
+ ndirty++;
+ bh->b_flushtime = 0;
+ if (major == LOOP_MAJOR) {
+ ll_rw_block(wrta_cmd,1, &bh);
+ wrta_cmd = WRITEA;
+ if (buffer_dirty(bh))
+ --ndirty;
+ }
+ else
+ ll_rw_block(WRITE, 1, &bh);
#ifdef DEBUG
- if(nlist != BUF_DIRTY) ncount++;
+ if(nlist != BUF_DIRTY) ncount++;
#endif
- bh->b_count--;
- next->b_count--;
-
-
- } /* end for */
-
-
- } /* end for */
+ bh->b_count--;
+ next->b_count--;
+ }
+ }
#ifdef DEBUG
if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
printk("sleeping again.\n");
@@ -1810,80 +1764,19 @@
}


-static int shellsort(struct buffer_head *bh, struct buffer_head **bhptrs, int maxbh)
-
-{
-
- int count = 0;
- int split;
- int size;
- int incr;
- int counter;
- int offset;
- struct buffer_head *tptr;
-
-
- /* Copy buffer_head pointers into bhptrs */
- while(bh && (count < maxbh) ) {
-
- bhptrs[count] = bh;
- bh = bh->b_next_free;
- count++;
-
- } /* end while */
-
-
- if(count <= 1) return count;
-
- /* Now sort the list */
- split = count / 2;
-
-
- incr = 1;
- size = count - split;
- while(1) {
-
- counter = incr - 1;
- while(1) {
-
- offset = counter + split;
- if( (kdev_t_to_nr(bhptrs[counter]->b_dev) >
- kdev_t_to_nr(bhptrs[offset]->b_dev)) ||
- ((kdev_t_to_nr(bhptrs[counter]->b_dev) ==
- kdev_t_to_nr(bhptrs[offset]->b_dev)) &&
- (bhptrs[counter]->b_blocknr > bhptrs[offset]->b_blocknr))) {
-
- tptr = bhptrs[counter];
- bhptrs[counter] = bhptrs[offset];
- bhptrs[offset] = tptr;
- counter -= split;
- if(counter < 0) break;
-
- }
- else {
-
- break;
-
- } /* end if */
-
- } /* end while */
-
-
- incr++;
- if(incr > size) {
-
- split /= 2;
- if( !split ) break;
- incr = 1;
- size = count - split;
-
- } /* end while */
-
- } /* end while */
-
-
- return count;
-
-} /* end shellsort */
-
-
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-indent-level: 8
+ * c-brace-imaginary-offset: 0
+ * c-brace-offset: -8
+ * c-argdecl-indent: 8
+ * c-label-offset: -8
+ * c-continued-statement-offset: 8
+ * c-continued-brace-offset: 0
+ * End:
+ */

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu