diff -ur linux.orig/drivers/md/md.c linux/drivers/md/md.c --- linux.orig/drivers/md/md.c Sat Aug 3 02:39:44 2002 +++ linux/drivers/md/md.c Thu Sep 12 19:31:46 2002 @@ -782,7 +782,7 @@ static void print_rdev(mdk_rdev_t *rdev) { - printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", + printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d\n", partition_name(rdev->dev), partition_name(rdev->old_dev), rdev->size, rdev->faulty, rdev->desc_nr); if (rdev->sb) { @@ -984,7 +984,7 @@ struct md_list_head *tmp; ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty || rdev->alias_device) + if (rdev->faulty || rdev_is_alias(rdev)) continue; sb = rdev->sb; *sb = *mddev->sb; @@ -1036,11 +1036,11 @@ printk(KERN_INFO "md: "); if (rdev->faulty) printk("(skipping faulty "); - if (rdev->alias_device) + if (rdev_is_alias(rdev)) printk("(skipping alias "); printk("%s ", partition_name(rdev->dev)); - if (!rdev->faulty && !rdev->alias_device) { + if (!rdev->faulty && !rdev_is_alias(rdev)) { printk("[events: %08lx]", (unsigned long)rdev->sb->events_lo); err += write_disk_sb(rdev); @@ -1220,8 +1220,10 @@ if (alloc_array_sb(mddev)) goto abort; sb = mddev->sb; + + /* Find the freshest superblock */ freshest = NULL; - + ITERATE_RDEV(mddev,rdev,tmp) { __u64 ev1, ev2; /* @@ -1258,154 +1260,170 @@ printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev)); } memcpy (sb, freshest->sb, sizeof(*sb)); - - /* - * at this point we have picked the 'best' superblock - * from all available superblocks. - * now we validate this superblock and kick out possibly - * failed disks. - */ - ITERATE_RDEV(mddev,rdev,tmp) { - /* - * Kick all non-fresh devices - */ - __u64 ev1, ev2; - ev1 = md_event(rdev->sb); - ev2 = md_event(sb); - ++ev1; - if (ev1 < ev2) { - printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", - partition_name(rdev->dev)); - kick_rdev_from_array(rdev); - continue; - } - } - - /* - * Fix up changed device names ... but only if this disk has a - * recent update time. 
Use faulty checksum ones too. - */ - if (mddev->sb->level != -4) - ITERATE_RDEV(mddev,rdev,tmp) { - __u64 ev1, ev2, ev3; - if (rdev->faulty || rdev->alias_device) { - MD_BUG(); - goto abort; - } - ev1 = md_event(rdev->sb); - ev2 = md_event(sb); - ev3 = ev2; - --ev3; - if ((rdev->dev != rdev->old_dev) && - ((ev1 == ev2) || (ev1 == ev3))) { + + /* For multipathing, lots of things are different from "true" + * RAIDs. + * All rdev's could be read, so they are no longer faulty. + * As there is just one sb, trying to find changed devices via the + * this_disk pointer is useless too. + * + * lmb@suse.de, 2002-09-12 + */ + if (sb->level == -4) { + int desc_nr = 0; + + /* ... and initialize from the current rdevs instead */ + ITERATE_RDEV(mddev,rdev,tmp) { mdp_disk_t *desc; - - printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n", - partition_name(rdev->old_dev), partition_name(rdev->dev)); - if (rdev->desc_nr == -1) { - MD_BUG(); - goto abort; - } + + rdev->desc_nr=desc_nr; + desc = &sb->disks[rdev->desc_nr]; - if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { - MD_BUG(); - goto abort; - } + + desc->raid_disk = desc_nr; + desc->number = desc_nr; desc->major = MAJOR(rdev->dev); desc->minor = MINOR(rdev->dev); desc = &rdev->sb->this_disk; desc->major = MAJOR(rdev->dev); desc->minor = MINOR(rdev->dev); - } - } - /* - * Remove unavailable and faulty devices ... - * - * note that if an array becomes completely unrunnable due to - * missing devices, we do not write the superblock back, so the - * administrator has a chance to fix things up. The removal thus - * only happens if it's nonfatal to the contents of the array. 
- */ - for (i = 0; i < MD_SB_DISKS; i++) { - int found; - mdp_disk_t *desc; - kdev_t dev; + /* We could read from it, so it isn't faulty + * any longer */ + if (disk_faulty(desc)) + mark_disk_spare(desc); + + desc_nr++; + } - desc = sb->disks + i; - dev = MKDEV(desc->major, desc->minor); + /* Kick out all old info about disks we used to have, + * if any */ + for (i = desc_nr; i < MD_SB_DISKS; i++) + memset(&(sb->disks[i]),0,sizeof(mdp_disk_t)); + } else { /* - * We kick faulty devices/descriptors immediately. + * at this point we have picked the 'best' superblock + * from all available superblocks. * - * Note: multipath devices are a special case. Since we - * were able to read the superblock on the path, we don't - * care if it was previously marked as faulty, it's up now - * so enable it. + * now we validate this superblock and kick out possibly + * failed disks. */ - if (disk_faulty(desc) && mddev->sb->level != -4) { - found = 0; - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->desc_nr != desc->number) - continue; - printk(KERN_WARNING "md%d: kicking faulty %s!\n", - mdidx(mddev),partition_name(rdev->dev)); - kick_rdev_from_array(rdev); - found = 1; - break; - } - if (!found) { - if (dev == MKDEV(0,0)) - continue; - printk(KERN_WARNING "md%d: removing former faulty %s!\n", - mdidx(mddev), partition_name(dev)); - } - remove_descriptor(desc, sb); - continue; - } else if (disk_faulty(desc)) { + ITERATE_RDEV(mddev,rdev,tmp) { /* - * multipath entry marked as faulty, unfaulty it + * Kick all non-fresh devices */ - rdev = find_rdev(mddev, dev); - if(rdev) - mark_disk_spare(desc); - else - remove_descriptor(desc, sb); + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ++ev1; + if (ev1 < ev2) { + printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } } - if (dev == MKDEV(0,0)) - continue; /* - * Is this device present in the rdev ring? + * Fix up changed device names ... 
but only if this disk has a + * recent update time. Use faulty checksum ones too. */ - found = 0; ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n", + partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + /* - * Multi-path IO special-case: since we have no - * this_disk descriptor at auto-detect time, - * we cannot check rdev->number. - * We can check the device though. + * We kick faulty devices/descriptors immediately. 
*/ - if ((sb->level == -4) && (rdev->dev == - MKDEV(desc->major,desc->minor))) { - found = 1; - break; + if (disk_faulty(desc)) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk(KERN_WARNING "md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk(KERN_WARNING "md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; } - if (rdev->desc_nr == desc->number) { - found = 1; - break; + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } } + if (found) + continue; + + printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n", + mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); } - if (found) - continue; - printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n", - mdidx(mddev), partition_name(dev)); - remove_descriptor(desc, sb); } /* - * Double check wether all devices mentioned in the + * Double check whether all devices mentioned in the * superblock are in the rdev ring. */ first = 1; @@ -1429,23 +1447,6 @@ MD_BUG(); goto abort; } - /* - * In the case of Multipath-IO, we have no - * other information source to find out which - * disk is which, only the position of the device - * in the superblock: - */ - if (mddev->sb->level == -4) { - if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) { - MD_BUG(); - goto abort; - } - rdev->desc_nr = i; - if (!first) - rdev->alias_device = 1; - else - first = 0; - } } /* @@ -1460,31 +1461,25 @@ /* * Do a final reality check. 
*/ - if (mddev->sb->level != -4) { - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->desc_nr == -1) { + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + ITERATE_RDEV(mddev,rdev2,tmp2) { + /* is the desc_nr unique? */ + if (rdev2 == rdev) + continue; + + if (rdev2->desc_nr == rdev->desc_nr) { MD_BUG(); goto abort; } - /* - * is the desc_nr unique? - */ - ITERATE_RDEV(mddev,rdev2,tmp2) { - if ((rdev2 != rdev) && - (rdev2->desc_nr == rdev->desc_nr)) { - MD_BUG(); - goto abort; - } - } - /* - * is the device unique? - */ - ITERATE_RDEV(mddev,rdev2,tmp2) { - if ((rdev2 != rdev) && - (rdev2->dev == rdev->dev)) { - MD_BUG(); - goto abort; - } + + /* is the device unique? */ + if (rdev2->dev == rdev->dev) { + MD_BUG(); + goto abort; } } } @@ -1909,6 +1904,28 @@ */ mddev->sb_dirty = 0; do_md_stop (mddev, 0); + } else { + /* Create an rdev for the freshly started md device + * and add to the end of the list */ + kdev_t dev = MKDEV(MD_MAJOR,mddev->__minor); + if (md_import_device(dev,1)) { + printk("md: no nested md device found\n"); + } else { + rdev = find_rdev_all(dev); + /* This should all be impossible because we _just_ + * imported the device! 
*/ + if (!rdev) { + MD_BUG(); + return; + } + if (rdev->faulty) { + MD_BUG(); + return; + } + printk("md: added md%d to the autodetection\n", + mdidx(mddev)); + md_list_add(&rdev->pending, pending_raid_disks.prev); + } } } @@ -2337,10 +2354,9 @@ return -EINVAL; } disk = &mddev->sb->disks[rdev->desc_nr]; - if (disk_active(disk)) { - MD_BUG(); + if (disk_active(disk)) goto busy; - } + if (disk_removed(disk)) { MD_BUG(); return -EINVAL; @@ -2368,6 +2384,50 @@ return -EBUSY; } +/* + * find the matching mdp_disk_t and pass it to the personality's diskop + */ +static int set_mp_params(mddev_t * mddev, kdev_t dev, int state) +{ + mdp_disk_t *disk; + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + if (!mddev->pers->diskop) + return -ENXIO; + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + + disk = &mddev->sb->disks[rdev->desc_nr]; + +#if 0 + printk("set_mp_params: state: %d dev: %s disk: %s (%d)\n", + state, + partition_name(dev), + partition_name(MKDEV(disk->major,disk->minor)), + disk->number); + print_rdev(rdev); +#endif + switch (state) { + case SET_DISK_ACTIVE: + return mddev->pers->diskop(mddev, &disk, DISKOP_ACTIVATE_PATH); + case SET_DISK_INACTIVE: + return mddev->pers->diskop(mddev, &disk, DISKOP_DISABLE_PATH); + case SET_DISK_CLEAN: + return mddev->pers->diskop(mddev, &disk, DISKOP_CLEAN_PATH); + } + + return -EINVAL; +} + static int hot_add_disk(mddev_t * mddev, kdev_t dev) { int i, err, persistent; @@ -2859,6 +2919,12 @@ goto done_unlock; } + case SET_DISK_CLEAN: + case SET_DISK_ACTIVE: + case SET_DISK_INACTIVE: + err = set_mp_params(mddev, (kdev_t) arg, cmd); + goto done_unlock; + default: printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, " "upgrade your software to use new ictls.\n", @@ -3274,6 +3340,7 @@ } disk = &sb->disks[rdev->desc_nr]; if (disk_faulty(disk)) { + print_desc(disk); MD_BUG(); continue; } @@ -3521,6 +3588,8 @@ sb = mddev->sb; if (!sb) continue; + if (sb->level 
== -4) + continue; if (mddev->recovery_running) continue; if (sb->active_disks == sb->raid_disks) @@ -4047,6 +4116,7 @@ MD_EXPORT_SYMBOL(md_wakeup_thread); MD_EXPORT_SYMBOL(md_print_devices); MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(find_rdev); MD_EXPORT_SYMBOL(md_interrupt_thread); MD_EXPORT_SYMBOL(mddev_map); MD_EXPORT_SYMBOL(md_check_ordering); diff -ur linux.orig/drivers/md/multipath.c linux/drivers/md/multipath.c --- linux.orig/drivers/md/multipath.c Mon Feb 25 20:37:58 2002 +++ linux/drivers/md/multipath.c Thu Sep 12 19:02:04 2002 @@ -52,6 +52,7 @@ struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail; static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state); +static void print_multipath_conf (multipath_conf_t *conf); @@ -190,6 +191,9 @@ void multipath_end_request (struct buffer_head *bh, int uptodate) { struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private); + struct multipath_info * multipath = mp_bh->multipath; + + atomic_dec(&multipath->nr_pending_buffers); /* * this branch is our 'one multipath IO has finished' event handler: @@ -224,18 +228,66 @@ /* * This routine returns the disk from which the requested read should - * be done. + * be done. If possible, select the path where we can be lucky enough that + * the buffer will merge with the previous request. 
If no matches found + * there, just select the least busy path */ - -static int multipath_read_balance (multipath_conf_t *conf) +static int multipath_rw_balance (multipath_conf_t *conf, int rw, + struct buffer_head *bh) { - int disk; + int disk, best_so_far = -1, nr_pending = 0, tmp, first = 1; - for (disk = 0; disk < conf->raid_disks; disk++) - if (conf->multipaths[disk].operational) - return disk; - BUG(); - return 0; + for (disk = 0; disk < conf->raid_disks; disk++) { + struct multipath_info *mpi = &conf->multipaths[disk]; + + if (!mpi->operational) + continue; + + if (first) { + best_so_far = disk; + +#if 1 + /* + * WRITEs, choose first operational path always + */ + if (rw == WRITE) + break; +#endif + + nr_pending = atomic_read(&mpi->nr_pending_buffers); + } + + /* + * check for possible front- or back merge + */ + if (rw == mpi->last_rw) { + + if (bh->b_rsector == mpi->last_end_sector + || bh->b_rsector + (bh->b_size >> 9) == mpi->last_start_sector) { + best_so_far = disk; + break; + } + } + + /* + * don't check pending for first disk, already did + */ + if (first) { + first = 0; + continue; + } + + /* + * just a ball park number, good enough for balancing + */ + tmp = atomic_read(&mpi->nr_pending_buffers); + if (tmp < nr_pending) { + nr_pending = tmp; + best_so_far = disk; + } + } + + return best_so_far; } static int multipath_make_request (mddev_t *mddev, int rw, @@ -245,6 +297,7 @@ struct buffer_head *bh_req; struct multipath_bh * mp_bh; struct multipath_info *multipath; + int ret; if (!buffer_locked(bh)) BUG(); @@ -265,16 +318,28 @@ mp_bh->cmd = rw; /* - * read balancing logic: + * read/write balancing logic: */ - multipath = conf->multipaths + multipath_read_balance(conf); + if ((ret = multipath_rw_balance(conf, rw, bh)) == -1) { + print_multipath_conf(conf); + MD_BUG(); + buffer_IO_error(bh); + return 0; + } + + multipath = conf->multipaths + ret; + + multipath->last_end_sector = bh->b_rsector + (bh->b_size >> 9); + multipath->last_start_sector = 
bh->b_rsector; + multipath->last_rw = rw; + + atomic_inc(&multipath->nr_pending_buffers); + mp_bh->multipath = multipath; bh_req = &mp_bh->bh_req; memcpy(bh_req, bh, sizeof(*bh)); - bh_req->b_blocknr = bh->b_rsector; bh_req->b_dev = multipath->dev; bh_req->b_rdev = multipath->dev; -/* bh_req->b_rsector = bh->n_rsector; */ bh_req->b_end_io = multipath_end_request; bh_req->b_private = mp_bh; generic_make_request (rw, bh_req); @@ -305,24 +370,123 @@ "multipath: IO failure on %s, disabling IO path. \n" \ " Operation continuing on %d IO paths.\n" +static inline void clear_disk_faulty(mdp_disk_t *d) +{ + d->state &= ~(1 << MD_DISK_FAULTY); +} + +static void verify_positions(mddev_t *mddev, const char *string) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + int i; + + printk("verify_positions: %s\n", string); + + for (i = 0; i < sb->raid_disks; i++) { + desc = &sb->disks[i]; + if (desc->number != i) + printk("desc %p is in spot %d, but names %d\n", desc, i, desc->number); + } +} + +/* + * switch positions 1 and 2 in the array list + */ +static void multipath_switch_pos(multipath_conf_t *conf, mdp_disk_t *desc, + mdp_disk_t *desc2) +{ + struct multipath_info *disk, *disk2; + mdk_rdev_t *rdev1, *rdev2; + + verify_positions(conf->mddev, "switch_pos_begin"); + + disk = conf->multipaths + desc->raid_disk; + disk2 = conf->multipaths + desc2->raid_disk; + + xchg_values(*desc2,*desc); + xchg_values(*disk2,*disk); + xchg_values(desc2->number, desc->number); + xchg_values(disk2->number, disk->number); + xchg_values(desc2->raid_disk, desc->raid_disk); + xchg_values(disk2->raid_disk, disk->raid_disk); + + rdev1 = find_rdev_nr(conf->mddev, desc->raid_disk); + rdev2 = find_rdev_nr(conf->mddev, desc2->raid_disk); + + if (rdev1 && rdev2) + xchg_values(rdev1->desc_nr, rdev2->desc_nr); + + verify_positions(conf->mddev, "switch_pos_end"); +} + +static void mark_disk_good (mddev_t *mddev, int disk) +{ + multipath_conf_t *conf = mddev_to_conf(mddev); + struct multipath_info *multipath 
= conf->multipaths + disk; + mdp_super_t *sb = mddev->sb; + mdk_rdev_t *rdev; + + rdev = find_rdev_nr(mddev, disk); + rdev->faulty = 0; + + multipath->spare = 1; + if (!disk_faulty(sb->disks+multipath->number)) { + printk("mark_disk_good: disk %d already good\n",disk); + return; + } + + clear_disk_faulty(sb->disks+multipath->number); + sb->failed_disks--; + + sb->working_disks++; + sb->spare_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + conf->working_disks++; + +} + static void mark_disk_bad (mddev_t *mddev, int failed) { multipath_conf_t *conf = mddev_to_conf(mddev); struct multipath_info *multipath = conf->multipaths+failed; mdp_super_t *sb = mddev->sb; + int last_drive = conf->working_disks - 1; + + printk("md: mark_disk_bad: disk %d\n", failed); multipath->operational = 0; + if (disk_faulty(sb->disks+multipath->number)) { + printk("md: Disk already marked bad. Doing nothing!\n"); + return; + } + mark_disk_faulty(sb->disks+multipath->number); mark_disk_nonsync(sb->disks+multipath->number); - mark_disk_inactive(sb->disks+multipath->number); - sb->active_disks--; + if (disk_active(sb->disks+multipath->number)) { + printk("Deactivating disk\n"); + mark_disk_inactive(sb->disks+multipath->number); + sb->active_disks--; + conf->raid_disks--; + sb->raid_disks--; + } + conf->working_disks--; sb->working_disks--; sb->failed_disks++; mddev->sb_dirty = 1; - md_wakeup_thread(conf->thread); - conf->working_disks--; + printk (DISK_FAILED, partition_name (multipath->dev), conf->working_disks); + + /* + * switch the faulty drive with the last one, so the working ones + * are always at the front of the array + */ + if (failed != last_drive) + multipath_switch_pos(conf, &sb->disks[failed], &sb->disks[last_drive]); + + md_wakeup_thread(conf->thread); } /* @@ -334,6 +498,7 @@ struct multipath_info * multipaths = conf->multipaths; int disks = MD_SB_DISKS; int other_paths = 1; + mdk_rdev_t *rrdev; int i; if (conf->working_disks == 1) { @@ -354,7 +519,7 @@ */ for 
(i = 0; i < disks; i++) { if (multipaths[i].dev==dev && !multipaths[i].operational) - return 0; + goto out; } printk (LAST_DISK); } else { @@ -367,33 +532,38 @@ break; } } - if (!conf->working_disks) { - int err = 1; + rrdev = find_rdev(mddev, dev); + if (rrdev) + rrdev->faulty = 1; + + if (conf->working_disks > 0) { mdp_disk_t *spare; - mdp_super_t *sb = mddev->sb; - + spare = get_spare(mddev); if (spare) { - err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE); - printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare)); - } - if (!err && !disk_faulty(spare)) { - multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); - mark_disk_sync(spare); - mark_disk_active(spare); - sb->active_disks++; - sb->spare_disks--; + if (!disk_faulty(spare)) { + multipath_diskop(mddev, + &spare, DISKOP_ACTIVATE_PATH); + } else { + printk("md: Our spare is faulty!\n"); + print_multipath_conf(conf); + MD_BUG(); + } } + } else { + printk(NO_SPARE_DISK); } } - return 0; + +out: + + return 2; } #undef LAST_DISK #undef NO_SPARE_DISK #undef DISK_FAILED - static void print_multipath_conf (multipath_conf_t *conf) { int i; @@ -426,59 +596,82 @@ struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; mdp_super_t *sb = mddev->sb; mdp_disk_t *failed_desc, *spare_desc, *added_desc; - mdk_rdev_t *spare_rdev, *failed_rdev; +#if 0 print_multipath_conf(conf); +#endif md_spin_lock_irq(&conf->device_lock); /* * find the disk ... 
*/ switch (state) { - case DISKOP_SPARE_ACTIVE: + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + printk(KERN_ERR "md: multipath does not support recovery operations\n"); + goto abort; + break; + + case DISKOP_ACTIVATE_PATH: + + for (i = 0; i < conf->nr_disks; i++) { + tmp = conf->multipaths + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + err = -EBUSY; + goto abort; + } + + failed_disk = conf->working_disks; + break; + + case DISKOP_DISABLE_PATH: + + if (conf->raid_disks == 1) { + printk("multipath: can't disable last path\n"); + err = -EBUSY; + goto abort; + } - /* - * Find the failed disk within the MULTIPATH configuration ... - * (this can only be in the first conf->working_disks part) - */ for (i = 0; i < conf->raid_disks; i++) { tmp = conf->multipaths + i; - if ((!tmp->operational && !tmp->spare) || - !tmp->used_slot) { - failed_disk = i; + if (tmp->operational && tmp->number == (*d)->number) { + removed_disk = i; break; } } - /* - * When we activate a spare disk we _must_ have a disk in - * the lower (active) part of the array to replace. - */ - if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { - MD_BUG(); - err = 1; + + if (removed_disk == -1) { + err = -EINVAL; goto abort; } - /* fall through */ - case DISKOP_SPARE_WRITE: - case DISKOP_SPARE_INACTIVE: + failed_disk = conf->raid_disks - 1; + break; + case DISKOP_CLEAN_PATH: /* - * Find the spare disk ... 
(can only be in the 'high' - * area of the array) + * find faulty drive */ - for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + for (i = 0; i < conf->nr_disks; i++) { tmp = conf->multipaths + i; - if (tmp->spare && tmp->number == (*d)->number) { - spare_disk = i; + if (!tmp->operational && !tmp->spare + && tmp->number == (*d)->number) { + failed_disk = i; break; } } - if (spare_disk == -1) { - MD_BUG(); - err = 1; + + if (failed_disk == -1) { + err = -EINVAL; goto abort; } + + spare_disk = conf->working_disks; break; case DISKOP_HOT_REMOVE_DISK: @@ -520,31 +713,60 @@ } switch (state) { - /* - * Switch the spare disk to write-only mode: - */ - case DISKOP_SPARE_WRITE: - sdisk = conf->multipaths + spare_disk; - sdisk->operational = 1; + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->multipaths + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + rdisk->number = 0; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + conf->nr_disks--; break; - /* - * Deactivate a spare disk: - */ - case DISKOP_SPARE_INACTIVE: - sdisk = conf->multipaths + spare_disk; - sdisk->operational = 0; + case DISKOP_HOT_ADD_DISK: + adisk = conf->multipaths + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + printk("DISKOP_HOT_ADD_DISK: we want slot %d != md wants %d\n", + added_disk, added_desc->number); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->spare = 1; + adisk->used_slot = 1; + conf->nr_disks++; + /* Believing it to be good until proven otherwise ;-) */ + conf->working_disks++; + + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + break; - /* - * Activate (mark read-write) the (now sync) spare disk, - * which means we switch it's 'raid position' (->raid_disk) - * with the 
failed disk. (only the first 'conf->nr_disks' - * slots are used for 'real' disks and we must preserve this - * property) - */ - case DISKOP_SPARE_ACTIVE: + + case DISKOP_ACTIVATE_PATH: sdisk = conf->multipaths + spare_disk; fdisk = conf->multipaths + failed_disk; + printk("md: activate: %s\n", + partition_name(sdisk->dev)); + + if (!sdisk->spare) { + printk("md: activate: not spare\n"); + goto abort; + } + spare_desc = &sb->disks[sdisk->number]; failed_desc = &sb->disks[fdisk->number]; @@ -572,90 +794,117 @@ goto abort; } +#if 0 if (fdisk->raid_disk != failed_disk) { MD_BUG(); err = 1; goto abort; } +#endif + + sdisk->spare = 0; + sdisk->operational = 1; /* * do the switch finally */ - spare_rdev = find_rdev_nr(mddev, spare_desc->number); - failed_rdev = find_rdev_nr(mddev, failed_desc->number); - xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr); - spare_rdev->alias_device = 0; - failed_rdev->alias_device = 1; - - xchg_values(*spare_desc, *failed_desc); - xchg_values(*fdisk, *sdisk); + if (failed_disk < spare_disk) { + multipath_switch_pos(conf, spare_desc, failed_desc); - /* - * (careful, 'failed' and 'spare' are switched from now on) - * - * we want to preserve linear numbering and we want to - * give the proper raid_disk number to the now activated - * disk. (this means we switch back these values) - */ - - xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); - xchg_values(sdisk->raid_disk, fdisk->raid_disk); - xchg_values(spare_desc->number, failed_desc->number); - xchg_values(sdisk->number, fdisk->number); + *d = failed_desc; - *d = failed_desc; + if (sdisk->dev == MKDEV(0,0)) + MD_BUG(); + } - if (sdisk->dev == MKDEV(0,0)) - sdisk->used_slot = 0; /* * this really activates the spare. 
*/ - fdisk->spare = 0; + mark_disk_active(*d); + conf->raid_disks++; + sb->raid_disks++; + sb->active_disks++; + mddev->sb->spare_disks--; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + break; + + case DISKOP_DISABLE_PATH: + sdisk = conf->multipaths + removed_disk; + fdisk = conf->multipaths + failed_disk; + + printk("md: deactivate: %s\n", + partition_name(sdisk->dev)); + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + printk("diskop_disable_path: we want %s (%d), md said %s (%d)\n", + partition_name(MKDEV(spare_desc->major,spare_desc->minor)), + removed_disk, + partition_name(MKDEV((*d)->major,(*d)->minor)), + (*d)->number); + MD_BUG(); + err = 1; + goto abort; + } /* - * if we activate a spare, we definitely replace a - * non-operational disk slot in the 'low' area of - * the disk array. + * do the switch finally */ + if (removed_disk != failed_disk) { + multipath_switch_pos(conf, spare_desc, failed_desc); - conf->working_disks++; + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + MD_BUG(); + } + mark_disk_spare(*d); + mark_disk_inactive(*d); + fdisk->spare = 1; + fdisk->operational = 0; + conf->raid_disks--; + sb->raid_disks--; + sb->active_disks--; + mddev->sb->spare_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); break; - case DISKOP_HOT_REMOVE_DISK: - rdisk = conf->multipaths + removed_disk; + case DISKOP_CLEAN_PATH: + fdisk = conf->multipaths + failed_disk; - if (rdisk->spare && (removed_disk < conf->raid_disks)) { - MD_BUG(); + failed_desc = &sb->disks[fdisk->number]; + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); err = 1; goto abort; } - rdisk->dev = MKDEV(0,0); - rdisk->used_slot = 0; - conf->nr_disks--; - break; - - case DISKOP_HOT_ADD_DISK: - adisk = conf->multipaths + added_disk; - added_desc = *d; - if (added_disk != added_desc->number) { - MD_BUG(); + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); err = 1; goto abort; } - 
adisk->number = added_desc->number; - adisk->raid_disk = added_desc->raid_disk; - adisk->dev = MKDEV(added_desc->major,added_desc->minor); + /* + * clear the faulty state; mark_disk_good() flags the path as a spare again. + */ + fdisk->spare = 0; + mark_disk_good(mddev, failed_disk); - adisk->operational = 0; - adisk->spare = 1; - adisk->used_slot = 1; - conf->nr_disks++; + if (failed_disk != spare_disk) { + spare_desc = &sb->disks[spare_disk]; + multipath_switch_pos(conf, failed_desc, spare_desc); + } break; + default: MD_BUG(); err = 1; @@ -833,7 +1082,7 @@ mdp_disk_t *desc, *desc2; mdk_rdev_t *rdev, *def_rdev = NULL; struct md_list_head *tmp; - int num_rdevs = 0; + int num_rdevs = 0, nr_spares = 0; MOD_INC_USE_COUNT; @@ -854,6 +1103,7 @@ goto out; } memset(conf, 0, sizeof(*conf)); + conf->mddev = mddev; ITERATE_RDEV(mddev,rdev,tmp) { if (rdev->faulty) { @@ -878,9 +1128,6 @@ disk_idx = desc->raid_disk; disk = conf->multipaths + disk_idx; - if (!disk_sync(desc)) - printk(NOT_IN_SYNC, partition_name(rdev->dev)); - /* * Mark all disks as spare to start with, then pick our * active disk. If we have a disk that is marked active @@ -892,24 +1139,34 @@ disk->operational = 0; disk->spare = 1; disk->used_slot = 1; - mark_disk_sync(desc); + /* + * bring paths online and spare. 
online paths will be + * read balanced automatically + */ if (disk_active(desc)) { - if(!conf->working_disks) { - printk(OPERATIONAL, partition_name(rdev->dev), - desc->raid_disk); - disk->operational = 1; - disk->spare = 0; - conf->working_disks++; + printk(OPERATIONAL, partition_name(rdev->dev), desc->raid_disk); + disk->operational = 1; + disk->spare = 0; + + if (!conf->working_disks++) def_rdev = rdev; - } else { - mark_disk_spare(desc); - } - } else + + conf->raid_disks++; +#if 0 + sb->raid_disks++; + sb->active_disks++; +#endif + } else { + nr_spares++; mark_disk_spare(desc); + } - if(!num_rdevs++) def_rdev = rdev; + if (!num_rdevs++) + def_rdev = rdev; } + + /* Make the first path active if no active path was found before */ if(!conf->working_disks && num_rdevs) { desc = &sb->disks[def_rdev->desc_nr]; disk = conf->multipaths + desc->raid_disk; @@ -917,16 +1174,18 @@ disk->raid_disk); disk->operational = 1; disk->spare = 0; + nr_spares--; conf->working_disks++; + conf->raid_disks++; mark_disk_active(desc); } /* * Make sure our active path is in desc spot 0 */ - if(def_rdev->desc_nr != 0) { - rdev = find_rdev_nr(mddev, 0); + if (def_rdev && def_rdev->desc_nr != 0) { desc = &sb->disks[def_rdev->desc_nr]; desc2 = sb->disks; + disk = conf->multipaths + desc->raid_disk; disk2 = conf->multipaths + desc2->raid_disk; xchg_values(*desc2,*desc); @@ -935,18 +1194,20 @@ xchg_values(disk2->number, disk->number); xchg_values(desc2->raid_disk, desc->raid_disk); xchg_values(disk2->raid_disk, disk->raid_disk); + + rdev = find_rdev_nr(mddev, 0); if(rdev) { xchg_values(def_rdev->desc_nr,rdev->desc_nr); } else { def_rdev->desc_nr = 0; } } - conf->raid_disks = sb->raid_disks = sb->active_disks = 1; conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs; sb->failed_disks = 0; - sb->spare_disks = num_rdevs - 1; + sb->spare_disks = nr_spares; + sb->active_disks = num_rdevs - nr_spares; + sb->raid_disks = sb->active_disks; mddev->sb_dirty = 1; - conf->mddev = mddev; 
conf->device_lock = MD_SPIN_LOCK_UNLOCKED; init_waitqueue_head(&conf->wait_buffer); @@ -997,11 +1258,8 @@ * each device. */ for (i = 0; i < MD_SB_DISKS; i++) { - mark_disk_nonsync(sb->disks+i); - for (j = 0; j < sb->raid_disks; j++) { - if (sb->disks[i].number == conf->multipaths[j].number) - mark_disk_sync(sb->disks+i); - } + for (j = 0; j < sb->raid_disks; j++) + atomic_set(&conf->multipaths[j].nr_pending_buffers, 0); } printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, diff -ur linux.orig/include/linux/raid/md_k.h linux/include/linux/raid/md_k.h --- linux.orig/include/linux/raid/md_k.h Mon Nov 26 14:29:17 2001 +++ linux/include/linux/raid/md_k.h Thu Sep 12 14:27:18 2002 @@ -173,12 +173,16 @@ mdp_super_t *sb; unsigned long sb_offset; - int alias_device; /* device alias to the same disk */ int faulty; /* if faulty do not issue IO requests */ int desc_nr; /* descriptor index in the superblock */ }; +static inline int rdev_is_alias(mdk_rdev_t * rdev) +{ + return ((rdev->sb->level == -4) && (rdev->desc_nr != 0)); +} + /* * disk operations in a working array: */ @@ -187,6 +191,9 @@ #define DISKOP_SPARE_ACTIVE 2 #define DISKOP_HOT_REMOVE_DISK 3 #define DISKOP_HOT_ADD_DISK 4 +#define DISKOP_ACTIVATE_PATH 5 +#define DISKOP_DISABLE_PATH 6 +#define DISKOP_CLEAN_PATH 7 typedef struct mdk_personality_s mdk_personality_t; diff -ur linux.orig/include/linux/raid/md_u.h linux/include/linux/raid/md_u.h --- linux.orig/include/linux/raid/md_u.h Fri Sep 14 23:21:51 2001 +++ linux/include/linux/raid/md_u.h Thu Sep 12 14:27:18 2002 @@ -44,6 +44,11 @@ #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +/* multi-path extensions */ +#define SET_DISK_CLEAN _IO (MD_MAJOR, 0x35) +#define SET_DISK_ACTIVE _IO (MD_MAJOR, 0x36) +#define SET_DISK_INACTIVE _IO (MD_MAJOR, 0x37) + typedef struct mdu_version_s { int major; int minor; diff -ur linux.orig/include/linux/raid/multipath.h linux/include/linux/raid/multipath.h --- 
linux.orig/include/linux/raid/multipath.h Wed Sep 11 07:40:45 2002 +++ linux/include/linux/raid/multipath.h Thu Sep 12 14:29:10 2002 @@ -15,6 +15,11 @@ int spare; int used_slot; + + atomic_t nr_pending_buffers; + unsigned long last_start_sector; + unsigned long last_end_sector; + int last_rw; }; struct multipath_private_data { @@ -63,6 +68,7 @@ struct buffer_head *master_bh; struct buffer_head bh_req; struct multipath_bh *next_mp; /* next for retry or in free list */ + struct multipath_info *multipath; }; /* bits for multipath_bh.state */ #define MPBH_Uptodate 1