Re: [PATCH 2/2] Migrate zone cache from RB-Tree to arrays of descriptors

From: Shaun Tancheff
Date: Mon Aug 22 2016 - 01:26:21 EST


On Sun, Aug 21, 2016 at 11:34 PM, Shaun Tancheff <shaun@xxxxxxxxxxxx> wrote:
> Currently the RB-Tree zone cache is fast and flexible, but it
> uses a rather large amount of RAM. This model reduces the RAM
> required from 120 bytes per zone to 16 bytes per zone with a
> moderate transformation of the blk_lookup_zone() API.
>
> This model is predicated on the belief that most variations
> on zoned media will follow a pattern of using collections of same
> sized zones on a single device. Similar to the pattern of erase
> blocks on flash devices being progressively larger: 16K, 64K, ...
>
> The goal is to be able to build a descriptor which is memory
> efficient, performant, and flexible.
>
> Signed-off-by: Shaun Tancheff <shaun.tancheff@xxxxxxxxxxx>
> ---
> block/blk-core.c | 2 +-
> block/blk-sysfs.c | 31 +-
> block/blk-zoned.c | 103 +++--
> drivers/scsi/sd.c | 5 +-
> drivers/scsi/sd.h | 4 +-
> drivers/scsi/sd_zbc.c | 1025 +++++++++++++++++++++++++++---------------------
> include/linux/blkdev.h | 82 +++-
> 7 files changed, 716 insertions(+), 536 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 3a9caf7..3b084a8 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -727,7 +727,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
> INIT_LIST_HEAD(&q->blkg_list);
> #endif
> #ifdef CONFIG_BLK_DEV_ZONED
> - q->zones = RB_ROOT;
> + q->zones = NULL;
> #endif
> INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
>
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 43f441f..ecbd434 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -232,36 +232,7 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
> #ifdef CONFIG_BLK_DEV_ZONED
> static ssize_t queue_zoned_show(struct request_queue *q, char *page)
> {
> - struct rb_node *node;
> - struct blk_zone *zone;
> - ssize_t offset = 0, end = 0;
> - size_t size = 0, num = 0;
> - enum blk_zone_type type = BLK_ZONE_TYPE_UNKNOWN;
> -
> - for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> - zone = rb_entry(node, struct blk_zone, node);
> - if (zone->type != type ||
> - zone->len != size ||
> - end != zone->start) {
> - if (size != 0)
> - offset += sprintf(page + offset, "%zu\n", num);
> - /* We can only store one page ... */
> - if (offset + 42 > PAGE_SIZE) {
> - offset += sprintf(page + offset, "...\n");
> - return offset;
> - }
> - size = zone->len;
> - type = zone->type;
> - offset += sprintf(page + offset, "%zu %zu %d ",
> - zone->start, size, type);
> - num = 0;
> - end = zone->start + size;
> - } else
> - end += zone->len;
> - num++;
> - }
> - offset += sprintf(page + offset, "%zu\n", num);
> - return offset;
> + return sprintf(page, "%u\n", q->zones ? 1 : 0);
> }
> #endif
>
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index 975e863..338a1af 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -8,63 +8,84 @@
> #include <linux/kernel.h>
> #include <linux/module.h>
> #include <linux/blkdev.h>
> -#include <linux/rbtree.h>
> +#include <linux/vmalloc.h>
>
> -struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t lba)
> +/**
> + * blk_lookup_zone() - Lookup zones
> + * @q: Request Queue
> + * @sector: Location to lookup
> + * @start: Pointer to starting location zone (OUT)
> + * @len: Pointer to length of zone (OUT)
> + * @lock: Pointer to spinlock of zones in owning descriptor (OUT)
> + */
> +struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t sector,
> + sector_t *start, sector_t *len,
> + spinlock_t **lock)
> {
> - struct rb_root *root = &q->zones;
> - struct rb_node *node = root->rb_node;
> + int iter;
> + struct blk_zone *bzone = NULL;
> + struct zone_wps *zi = q->zones;
> +
> + *start = 0;
> + *len = 0;
> + *lock = NULL;
> +
> + if (!q->zones)
> + goto out;
>
> - while (node) {
> - struct blk_zone *zone = container_of(node, struct blk_zone,
> - node);
> + for (iter = 0; iter < zi->wps_count; iter++) {
> + if (sector >= zi->wps[iter]->start_lba &&
> + sector < zi->wps[iter]->last_lba) {
> + struct contiguous_wps *wp = zi->wps[iter];
> + u64 index = (sector - wp->start_lba) / wp->zone_size;
>
> - if (lba < zone->start)
> - node = node->rb_left;
> - else if (lba >= zone->start + zone->len)
> - node = node->rb_right;
> - else
> - return zone;
> + if (index >= wp->zone_count) {
> + WARN(1, "Impossible index for zone\n");
> + goto out;
> + }
> +
> + bzone = &wp->zones[index];
> + *len = wp->zone_size;
> + *start = wp->start_lba + (index * wp->zone_size);
> + *lock = &wp->lock;
> + }
> }
> - return NULL;
> +
> +out:
> + return bzone;
> }
> EXPORT_SYMBOL_GPL(blk_lookup_zone);
>
> -struct blk_zone *blk_insert_zone(struct request_queue *q, struct blk_zone *data)
> +/**
> + * free_zone_wps() - Free up memory in use by wps
> + * @zi: zone wps array(s).
> + */
> +static void free_zone_wps(struct zone_wps *zi)
> {
> - struct rb_root *root = &q->zones;
> - struct rb_node **new = &(root->rb_node), *parent = NULL;
> + /* on error free the arrays */
> + if (zi && zi->wps) {
> + int ca;
>
> - /* Figure out where to put new node */
> - while (*new) {
> - struct blk_zone *this = container_of(*new, struct blk_zone,
> - node);
> - parent = *new;
> - if (data->start + data->len <= this->start)
> - new = &((*new)->rb_left);
> - else if (data->start >= this->start + this->len)
> - new = &((*new)->rb_right);
> - else {
> - /* Return existing zone */
> - return this;
> + for (ca = 0; ca < zi->wps_count; ca++) {
> + if (zi->wps[ca]) {
> + vfree(zi->wps[ca]);
> + zi->wps[ca] = NULL;
> + }
> }
> + kfree(zi->wps);
> }
> - /* Add new node and rebalance tree. */
> - rb_link_node(&data->node, parent, new);
> - rb_insert_color(&data->node, root);
> -
> - return NULL;
> }
> -EXPORT_SYMBOL_GPL(blk_insert_zone);
>
> +/**
> + * blk_drop_zones() - Free zones
> + * @q: Request Queue
> + */
> void blk_drop_zones(struct request_queue *q)
> {
> - struct rb_root *root = &q->zones;
> - struct blk_zone *zone, *next;
> -
> - rbtree_postorder_for_each_entry_safe(zone, next, root, node) {
> - kfree(zone);
> + if (q->zones) {
> + free_zone_wps(q->zones);
> + kfree(q->zones);
> + q->zones = NULL;
> }
> - q->zones = RB_ROOT;
> }
> EXPORT_SYMBOL_GPL(blk_drop_zones);
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index f144df4..0f749f5 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -2549,8 +2549,9 @@ got_data:
> sdkp->physical_block_size);
> sdkp->device->sector_size = sector_size;
>
> - if (sd_zbc_config(sdkp, buffer, SD_BUF_SIZE))
> - sd_config_discard(sdkp, SD_ZBC_RESET_WP);
> + if (sdkp->first_scan)
> + if (sd_zbc_config(sdkp, GFP_KERNEL))
> + sd_config_discard(sdkp, SD_ZBC_RESET_WP);
>
> {
> char cap_str_2[10], cap_str_10[10];
> diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
> index fc766db..c9c79e9 100644
> --- a/drivers/scsi/sd.h
> +++ b/drivers/scsi/sd.h
> @@ -299,13 +299,13 @@ extern void sd_zbc_uninit_command(struct scsi_cmnd *cmd);
> extern void sd_zbc_remove(struct scsi_disk *);
> extern void sd_zbc_reset_zones(struct scsi_disk *);
> extern void sd_zbc_update_zones(struct scsi_disk *, sector_t, int, int reason);
> -extern bool sd_zbc_config(struct scsi_disk *, void *, size_t);
> +extern bool sd_zbc_config(struct scsi_disk *, gfp_t);
>
> extern unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp);
>
> #else /* CONFIG_SCSI_ZBC */
>
> -static inline bool sd_zbc_config(struct scsi_disk *sdkp, void *b, size_t sz)
> +static inline bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp)
> {
> return false;
> }
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 960af93..c087035 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -22,6 +22,7 @@
>
> #include <linux/blkdev.h>
> #include <linux/rbtree.h>
> +#include <linux/vmalloc.h>
>
> #include <asm/unaligned.h>
>
> @@ -51,11 +52,11 @@
> } while( 0 )
>
> struct zbc_update_work {
> - struct work_struct zone_work;
> - struct scsi_disk *sdkp;
> - sector_t zone_sector;
> - int zone_buflen;
> - char zone_buf[0];
> + struct work_struct zone_work;
> + struct scsi_disk *sdkp;
> + sector_t zone_sector;
> + int zone_buflen;
> + struct bdev_zone_report zone_buf[0];
> };
>
> /**
> @@ -95,102 +96,19 @@ static inline sector_t get_start_from_desc(struct scsi_disk *sdkp,
> return logical_to_sectors(sdkp->device, be64_to_cpu(bzde->lba_start));
> }
>
> -static
> -struct blk_zone *zbc_desc_to_zone(struct scsi_disk *sdkp, unsigned char *rec)
> +static void _fill_zone(struct blk_zone *zone, struct scsi_disk *sdkp,
> + struct bdev_zone_descriptor *bzde)
> {
> - struct blk_zone *zone;
> - sector_t wp = (sector_t)-1;
> -
> - zone = kzalloc(sizeof(struct blk_zone), GFP_KERNEL);
> - if (!zone)
> - return NULL;
> -
> - spin_lock_init(&zone->lock);
> - zone->type = rec[0] & 0xf;
> - zone->state = (rec[1] >> 4) & 0xf;
> - zone->len = logical_to_sectors(sdkp->device,
> - get_unaligned_be64(&rec[8]));
> - zone->start = logical_to_sectors(sdkp->device,
> - get_unaligned_be64(&rec[16]));
> -
> - if (blk_zone_is_smr(zone))
> - wp = logical_to_sectors(sdkp->device,
> - get_unaligned_be64(&rec[24]));
> - zone->wp = wp;
> - /*
> - * Fixup block zone state
> - */
> - if (zone->state == BLK_ZONE_EMPTY &&
> - zone->wp != zone->start) {
> - sd_zbc_debug(sdkp,
> - "zone %zu state EMPTY wp %zu: adjust wp\n",
> - zone->start, zone->wp);
> - zone->wp = zone->start;
> - }
> - if (zone->state == BLK_ZONE_FULL &&
> - zone->wp != zone->start + zone->len) {
> - sd_zbc_debug(sdkp,
> - "zone %zu state FULL wp %zu: adjust wp\n",
> - zone->start, zone->wp);
> - zone->wp = zone->start + zone->len;
> - }
> -
> - return zone;
> + zone->type = bzde->type & 0x0f;
> + zone->state = (bzde->flags >> 4) & 0x0f;
> + zone->wp = get_wp_from_desc(sdkp, bzde);
> }
>
> -static
> -sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf,
> - unsigned int buf_len)
> -{
> - struct request_queue *q = sdkp->disk->queue;
> - unsigned char *rec = buf;
> - int rec_no = 0;
> - unsigned int list_length;
> - sector_t next_sector = -1;
> - u8 same;
> -
> - /* Parse REPORT ZONES header */
> - list_length = get_unaligned_be32(&buf[0]);
> - same = buf[4] & 0xf;
> - rec = buf + 64;
> - list_length += 64;
> -
> - if (list_length < buf_len)
> - buf_len = list_length;
> -
> - while (rec < buf + buf_len) {
> - struct blk_zone *this, *old;
> - unsigned long flags;
>
> - this = zbc_desc_to_zone(sdkp, rec);
> - if (!this)
> - break;
> -
> - if (same == 0 && this->len != zlen) {
> - next_sector = this->start + this->len;
> - break;
> - }
> -
> - next_sector = this->start + this->len;
> - old = blk_insert_zone(q, this);
> - if (old) {
> - spin_lock_irqsave(&old->lock, flags);
> - if (blk_zone_is_smr(old)) {
> - old->wp = this->wp;
> - old->state = this->state;
> - }
> - spin_unlock_irqrestore(&old->lock, flags);
> - kfree(this);
> - }
> - rec += 64;
> - rec_no++;
> - }
> -
> - sd_zbc_debug(sdkp,
> - "Inserted %d zones, next sector %zu len %d\n",
> - rec_no, next_sector, list_length);
> -
> - return next_sector;
> +static void fill_zone(struct contiguous_wps *cwps, int z_count,
> + struct scsi_disk *sdkp, struct bdev_zone_descriptor *bzde)
> +{
> + _fill_zone(&cwps->zones[z_count], sdkp, bzde);
> }
>
> /**
> @@ -200,12 +118,10 @@ sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf,
> * @bufflen: length of @buffer
> * @start_sector: logical sector for the zone information should be reported
> * @option: reporting option to be used
> - * @partial: flag to set the 'partial' bit for report zones command
> */
> -static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer,
> - int bufflen, sector_t start_sector,
> - enum zbc_zone_reporting_options option,
> - bool partial)
> +static int sd_zbc_report_zones(struct scsi_disk *sdkp,
> + struct bdev_zone_report *buffer,
> + int bufflen, sector_t start_sector, u8 option)
> {
> struct scsi_device *sdp = sdkp->device;
> const int timeout = sdp->request_queue->rq_timeout
> @@ -225,7 +141,7 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer,
> cmd[1] = ZI_REPORT_ZONES;
> put_unaligned_be64(start_lba, &cmd[2]);
> put_unaligned_be32(bufflen, &cmd[10]);
> - cmd[14] = (partial ? ZBC_REPORT_ZONE_PARTIAL : 0) | option;
> + cmd[14] = option;
> memset(buffer, 0, bufflen);
>
> result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
> @@ -248,49 +164,38 @@ static void sd_zbc_refresh_zone_work(struct work_struct *work)
> container_of(work, struct zbc_update_work, zone_work);
> struct scsi_disk *sdkp = zbc_work->sdkp;
> struct request_queue *q = sdkp->disk->queue;
> - unsigned char *zone_buf = zbc_work->zone_buf;
> + struct bdev_zone_report *rpt = zbc_work->zone_buf;
> unsigned int zone_buflen = zbc_work->zone_buflen;
> + struct bdev_zone_descriptor *bzde;
> + int iter;
> + int offmax;
> + sector_t z_at, z_start, z_len;
> + spinlock_t *lock;
> + struct blk_zone *zone;
> int ret;
> - u8 same;
> - u64 zlen = 0;
> sector_t last_sector;
> sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
>
> - ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen,
> + ret = sd_zbc_report_zones(sdkp, rpt, zone_buflen,
> zbc_work->zone_sector,
> - ZBC_ZONE_REPORTING_OPTION_ALL, true);
> + ZBC_ZONE_REPORTING_OPTION_ALL);
> if (ret)
> goto done_free;
>
> - /* this whole path is unlikely so extra reports shouldn't be a
> - * large impact */
> - same = zone_buf[4] & 0xf;
> - if (same == 0) {
> - unsigned char *desc = &zone_buf[64];
> - unsigned int blen = zone_buflen;
> -
> - /* just pull the first zone */
> - if (blen > 512)
> - blen = 512;
> - ret = sd_zbc_report_zones(sdkp, zone_buf, blen, 0,
> - ZBC_ZONE_REPORTING_OPTION_ALL, true);
> - if (ret)
> - goto done_free;
> -
> - /* Read the zone length from the first zone descriptor */
> - zlen = logical_to_sectors(sdkp->device,
> - get_unaligned_be64(&desc[8]));
> -
> - ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen,
> - zbc_work->zone_sector,
> - ZBC_ZONE_REPORTING_OPTION_ALL, true);
> - if (ret)
> - goto done_free;
> + offmax = max_report_entries(zone_buflen);
> + for (iter = 0; iter < offmax; iter++) {
> + bzde = &rpt->descriptors[iter];
> + z_at = get_start_from_desc(sdkp, bzde);
> + if (!z_at)
> + break;
> + zone = blk_lookup_zone(q, z_at, &z_start, &z_len, &lock);
> + if (zone) {
> + _fill_zone(zone, sdkp, bzde);
> + last_sector = z_start + z_len;
> + }
> }
>
> - last_sector = zbc_parse_zones(sdkp, zlen, zone_buf, zone_buflen);
> - capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
> - if (last_sector != -1 && last_sector < capacity) {
> + if (sdkp->zone_work_q && last_sector != -1 && last_sector < capacity) {
> if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
> sd_zbc_debug(sdkp,
> "zones in reset, canceling refresh\n");
> @@ -333,10 +238,7 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize,
> {
> struct request_queue *q = sdkp->disk->queue;
> struct zbc_update_work *zbc_work;
> - struct blk_zone *zone;
> - struct rb_node *node;
> - int zone_num = 0, zone_busy = 0, num_rec;
> - sector_t next_sector = sector;
> + int num_rec;
>
> if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
> sd_zbc_debug(sdkp,
> @@ -346,18 +248,23 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize,
>
> if (reason != SD_ZBC_INIT) {
> /* lookup sector, is zone pref? then ignore */
> - struct blk_zone *zone = blk_lookup_zone(q, sector);
> -
> + sector_t z_start, z_len;
> + spinlock_t *lck;
> + struct blk_zone *zone = blk_lookup_zone(q, sector, &z_start,
> + &z_len, &lck);
> + /* zone actions on conventional zones are invalid */
> + if (zone && reason == SD_ZBC_RESET_WP && blk_zone_is_cmr(zone))
> + return;
> if (reason == SD_ZBC_RESET_WP)
> sd_zbc_debug(sdkp, "RESET WP failed %lx\n", sector);
> -
> - if (zone && blk_zone_is_seq_pref(zone))
> - return;
> }
>
> + if (!sdkp->zone_work_q)
> + return;
> +
> retry:
> zbc_work = kzalloc(sizeof(struct zbc_update_work) + bufsize,
> - reason != SD_ZBC_INIT ? GFP_NOWAIT : GFP_KERNEL);
> + reason != SD_ZBC_INIT ? GFP_ATOMIC : GFP_KERNEL);
> if (!zbc_work) {
> if (bufsize > 512) {
> sd_zbc_debug(sdkp,
> @@ -381,30 +288,40 @@ retry:
> * Mark zones under update as BUSY
> */
> if (reason != SD_ZBC_INIT) {
> - for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> - unsigned long flags;
> -
> - zone = rb_entry(node, struct blk_zone, node);
> - if (num_rec == 0)
> + unsigned long flags;
> + int iter;
> + struct zone_wps *zi = q->zones;
> + struct contiguous_wps *wp = NULL;
> + u64 index = -1;
> + int zone_busy = 0;
> + int z_flgd = 0;
> +
> + for (iter = 0; iter < zi->wps_count; iter++) {
> + if (sector >= zi->wps[iter]->start_lba &&
> + sector < zi->wps[iter]->last_lba) {
> + wp = zi->wps[iter];
> break;
> - if (zone->start != next_sector)
> - continue;
> - next_sector += zone->len;
> - num_rec--;
> -
> - spin_lock_irqsave(&zone->lock, flags);
> - if (blk_zone_is_smr(zone)) {
> - if (zone->state == BLK_ZONE_BUSY) {
> + }
> + }
> + if (wp) {
> + spin_lock_irqsave(&wp->lock, flags);
> + index = (sector - wp->start_lba) / wp->zone_size;
> + while (index < wp->zone_count && z_flgd < num_rec) {
> + struct blk_zone *bzone = &wp->zones[index];
> +
> + index++;
> + z_flgd++;
> + if (!blk_zone_is_smr(bzone))
> + continue;
> +
> + if (bzone->state == BLK_ZONE_BUSY)
> zone_busy++;
> - } else {
> - zone->state = BLK_ZONE_BUSY;
> - zone->wp = zone->start;
> - }
> - zone_num++;
> + else
> + bzone->state = BLK_ZONE_BUSY;
> }
> - spin_unlock_irqrestore(&zone->lock, flags);
> + spin_unlock_irqrestore(&wp->lock, flags);
> }
> - if (zone_num && (zone_num == zone_busy)) {
> + if (z_flgd && (z_flgd == zone_busy)) {
> sd_zbc_debug(sdkp,
> "zone update for %zu in progress\n",
> sector);
> @@ -476,43 +393,26 @@ static void discard_or_write_same(struct scsi_cmnd *cmd, sector_t sector,
> int sd_zbc_setup_discard(struct scsi_cmnd *cmd)
> {
> struct request *rq = cmd->request;
> - struct scsi_device *sdp = cmd->device;
> struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
> sector_t sector = blk_rq_pos(rq);
> unsigned int nr_sectors = blk_rq_sectors(rq);
> int ret = BLKPREP_OK;
> struct blk_zone *zone;
> unsigned long flags;
> - u32 wp_offset;
> bool use_write_same = false;
> + sector_t z_start, z_len;
> + spinlock_t *lck;
>
> - zone = blk_lookup_zone(rq->q, sector);
> - if (!zone) {
> - /* Test for a runt zone before giving up */
> - if (sdp->type != TYPE_ZBC) {
> - struct request_queue *q = rq->q;
> - struct rb_node *node;
> -
> - node = rb_last(&q->zones);
> - if (node)
> - zone = rb_entry(node, struct blk_zone, node);
> - if (zone) {
> - spin_lock_irqsave(&zone->lock, flags);
> - if ((zone->start + zone->len) <= sector)
> - goto out;
> - spin_unlock_irqrestore(&zone->lock, flags);
> - zone = NULL;
> - }
> - }
> + zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck);
> + if (!zone)
> return BLKPREP_KILL;
> - }
>
> - spin_lock_irqsave(&zone->lock, flags);
> + spin_lock_irqsave(lck, flags);
> if (zone->state == BLK_ZONE_UNKNOWN ||
> zone->state == BLK_ZONE_BUSY) {
> sd_zbc_debug_ratelimit(sdkp,
> "Discarding zone %zx state %x, deferring\n",
> - zone->start, zone->state);
> + z_start, zone->state);
> ret = BLKPREP_DEFER;
> goto out;
> }
> @@ -520,39 +420,37 @@ int sd_zbc_setup_discard(struct scsi_cmnd *cmd)
> /* let the drive fail the command */
> sd_zbc_debug_ratelimit(sdkp,
> "Discarding offline zone %zx\n",
> - zone->start);
> + z_start);
> goto out;
> }
> if (blk_zone_is_cmr(zone)) {
> use_write_same = true;
> sd_zbc_debug_ratelimit(sdkp,
> - "Discarding CMR zone %zx\n",
> - zone->start);
> + "Discarding CMR zone %zx\n", z_start);
> goto out;
> }
> - if (zone->start != sector || zone->len < nr_sectors) {
> + if (z_start != sector || z_len < nr_sectors) {
> sd_printk(KERN_ERR, sdkp,
> "Misaligned RESET WP %zx/%x on zone %zx/%zx\n",
> - sector, nr_sectors, zone->start, zone->len);
> + sector, nr_sectors, z_start, z_len);
> ret = BLKPREP_KILL;
> goto out;
> }
> /* Protect against Reset WP when more data had been written to the
> * zone than is being discarded.
> */
> - wp_offset = zone->wp - zone->start;
> - if (wp_offset > nr_sectors) {
> + if (zone->wp > nr_sectors) {
> sd_printk(KERN_ERR, sdkp,
> - "Will Corrupt RESET WP %zx/%x/%x on zone %zx/%zx/%zx\n",
> - sector, wp_offset, nr_sectors,
> - zone->start, zone->wp, zone->len);
> + "Will Corrupt RESET WP %zx/%zx/%x on zone %zx/%zx/%zx\n",
> + sector, (sector_t)zone->wp, nr_sectors,
> + z_start, z_start + zone->wp, z_len);
> ret = BLKPREP_KILL;
> goto out;
> }
> if (blk_zone_is_empty(zone)) {
> sd_zbc_debug_ratelimit(sdkp,
> "Discarding empty zone %zx [WP: %zx]\n",
> - zone->start, zone->wp);
> + z_start, (sector_t)zone->wp);
> ret = BLKPREP_DONE;
> goto out;
> }
> @@ -563,8 +461,8 @@ out:
> * zone update if RESET WRITE POINTER fails.
> */
> if (ret == BLKPREP_OK && !use_write_same)
> - zone->wp = zone->start;
> - spin_unlock_irqrestore(&zone->lock, flags);
> + zone->wp = 0;
> + spin_unlock_irqrestore(lck, flags);
>
> if (ret == BLKPREP_OK)
> discard_or_write_same(cmd, sector, nr_sectors, use_write_same);
> @@ -573,13 +471,14 @@ out:
> }
>
>
> -static void __set_zone_state(struct blk_zone *zone, int op)
> +static void __set_zone_state(struct blk_zone *zone, sector_t z_len,
> + spinlock_t *lck, int op)
> {
> unsigned long flags;
>
> - spin_lock_irqsave(&zone->lock, flags);
> - if (blk_zone_is_cmr(zone))
> - goto out_unlock;
> + spin_lock_irqsave(lck, flags);
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> + goto out;
>
> switch (op) {
> case REQ_OP_ZONE_OPEN:
> @@ -587,38 +486,45 @@ static void __set_zone_state(struct blk_zone *zone, int op)
> break;
> case REQ_OP_ZONE_FINISH:
> zone->state = BLK_ZONE_FULL;
> - zone->wp = zone->start + zone->len;
> + zone->wp = z_len;
> break;
> case REQ_OP_ZONE_CLOSE:
> zone->state = BLK_ZONE_CLOSED;
> break;
> case REQ_OP_ZONE_RESET:
> - zone->wp = zone->start;
> + zone->wp = 0;
> break;
> default:
> WARN_ONCE(1, "%s: invalid op code: %u\n", __func__, op);
> }
> -out_unlock:
> - spin_unlock_irqrestore(&zone->lock, flags);
> +out:
> + spin_unlock_irqrestore(lck, flags);
> }
>
> static void update_zone_state(struct request *rq, sector_t lba, unsigned int op)
> {
> - struct request_queue *q = rq->q;
> - struct blk_zone *zone = NULL;
> + struct blk_zone *zone;
>
> if (lba == ~0ul) {
> - struct rb_node *node;
> -
> - for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> - zone = rb_entry(node, struct blk_zone, node);
> - __set_zone_state(zone, op);
> + struct zone_wps *zi = rq->q->zones;
> + struct contiguous_wps *wp;
> + u32 iter, entry;
> +
> + for (iter = 0; iter < zi->wps_count; iter++) {
> + wp = zi->wps[iter];
> + for (entry = 0; entry < wp->zone_count; entry++) {
> + zone = &wp->zones[entry];
> + __set_zone_state(zone, wp->zone_size, &wp->lock,
> + op);
> + }
> }
> - return;
> } else {
> - zone = blk_lookup_zone(q, lba);
> + sector_t z_start, z_len;
> + spinlock_t *lck;
> +
> + zone = blk_lookup_zone(rq->q, lba, &z_start, &z_len, &lck);
> if (zone)
> - __set_zone_state(zone, op);
> + __set_zone_state(zone, z_len, lck, op);
> }
> }
>
> @@ -641,6 +547,8 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd)
> struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
> sector_t sector = blk_rq_pos(rq);
> struct blk_zone *zone;
> + spinlock_t *lck;
> + sector_t z_start, z_len;
> unsigned long flags;
> unsigned int nr_sectors;
> int ret = BLKPREP_DONE;
> @@ -651,17 +559,17 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd)
> if (is_fua || op != REQ_OP_ZONE_RESET)
> goto out;
>
> - zone = blk_lookup_zone(rq->q, sector);
> + zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck);
> if (!zone || sdkp->provisioning_mode != SD_ZBC_RESET_WP)
> goto out;
>
> /* Map a Reset WP w/o FUA to a discard request */
> - spin_lock_irqsave(&zone->lock, flags);
> - sector = zone->start;
> - nr_sectors = zone->len;
> + spin_lock_irqsave(lck, flags);
> + sector = z_start;
> + nr_sectors = z_len;
> if (blk_zone_is_cmr(zone))
> use_write_same = true;
> - spin_unlock_irqrestore(&zone->lock, flags);
> + spin_unlock_irqrestore(lck, flags);
>
> rq->completion_data = NULL;
> if (use_write_same) {
> @@ -712,137 +620,157 @@ static sector_t bzrpt_fill(struct request *rq,
> struct bdev_zone_descriptor *bzd,
> size_t sz, sector_t lba, u8 opt)
> {
> - struct request_queue *q = rq->q;
> struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
> + struct scsi_device *sdp = sdkp->device;
> + struct zone_wps *zi = rq->q->zones;
> + struct contiguous_wps *wpdscr;
> struct blk_zone *zone = NULL;
> - struct rb_node *node = NULL;
> sector_t progress = lba;
> sector_t clen = ~0ul;
> + sector_t z_start, z_len, z_wp_abs;
> unsigned long flags;
> u32 max_entries = bzrpt ? max_report_entries(sz) : sz / sizeof(*bzd);
> u32 entry = 0;
> + u32 iter, idscr;
> int len_diffs = 0;
> int type_diffs = 0;
> u8 ctype;
> u8 same = 0;
>
> - zone = blk_lookup_zone(q, lba);
> - if (zone)
> - node = &zone->node;
> -
> - for (entry = 0; entry < max_entries && node; node = rb_next(node)) {
> - u64 z_len, z_start, z_wp_abs;
> - u8 cond = 0;
> - u8 flgs = 0;
> -
> - spin_lock_irqsave(&zone->lock, flags);
> - z_len = zone->len;
> - z_start = zone->start;
> - z_wp_abs = zone->wp;
> - progress = z_start + z_len;
> - cond = zone->state;
> - if (blk_zone_is_cmr(zone))
> - flgs |= 0x02;
> - else if (zone->wp != zone->start)
> - flgs |= 0x01; /* flag as RWP recommended? */
> - spin_unlock_irqrestore(&zone->lock, flags);
> -
> - switch (opt & ZBC_REPORT_OPTION_MASK) {
> - case ZBC_ZONE_REPORTING_OPTION_EMPTY:
> - if (z_wp_abs != z_start)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN:
> - if (cond != BLK_ZONE_OPEN)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN:
> - if (cond != BLK_ZONE_OPEN_EXPLICIT)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_CLOSED:
> - if (cond != BLK_ZONE_CLOSED)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_FULL:
> - if (cond != BLK_ZONE_FULL)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_READONLY:
> - if (cond == BLK_ZONE_READONLY)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_OFFLINE:
> - if (cond == BLK_ZONE_OFFLINE)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP:
> - if (z_wp_abs == z_start)
> - continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_NON_WP:
> - if (cond == BLK_ZONE_NO_WP)
> + for (iter = 0; entry < max_entries && iter < zi->wps_count; iter++) {
> + wpdscr = zi->wps[iter];
> + if (lba > wpdscr->last_lba)
> + continue;
> +
> + spin_lock_irqsave(&wpdscr->lock, flags);
> + for (idscr = 0;
> + entry < max_entries && idscr < wpdscr->zone_count;
> + idscr++) {
> + struct bdev_zone_descriptor *dscr;
> + u64 zoff = idscr * wpdscr->zone_size;
> + u8 cond, flgs = 0;
> +
> + z_len = wpdscr->zone_size;
> + zoff = idscr * z_len;
> + z_start = wpdscr->start_lba + zoff;
> + if (lba >= z_start + z_len)
> continue;
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE:
> - /* this can only be reported by the HW */
> - break;
> - case ZBC_ZONE_REPORTING_OPTION_ALL:
> - default:
> - break;
> - }
>
> - /* if same code only applies to returned zones */
> - if (opt & ZBC_REPORT_ZONE_PARTIAL) {
> - if (clen != ~0ul) {
> - clen = z_len;
> + zone = &wpdscr->zones[idscr];
> + if (blk_zone_is_cmr(zone))
> + z_wp_abs = z_start + wpdscr->zone_size;
> + else
> + z_wp_abs = z_start + zone->wp;
> +
> + switch (opt & ZBC_REPORT_OPTION_MASK) {
> + case ZBC_ZONE_REPORTING_OPTION_EMPTY:
> + if (z_wp_abs != z_start)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN:
> + if (zone->state != BLK_ZONE_OPEN)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN:
> + if (zone->state != BLK_ZONE_OPEN_EXPLICIT)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_CLOSED:
> + if (zone->state != BLK_ZONE_CLOSED)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_FULL:
> + if (zone->state != BLK_ZONE_FULL)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_READONLY:
> + if (zone->state == BLK_ZONE_READONLY)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_OFFLINE:
> + if (zone->state == BLK_ZONE_OFFLINE)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP:
> + if (z_wp_abs == z_start)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_NON_WP:
> + if (zone->state == BLK_ZONE_NO_WP)
> + continue;
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE:
> + /* this can only be reported by the HW */
> + break;
> + case ZBC_ZONE_REPORTING_OPTION_ALL:
> + default:
> + break;
> + }
> +
> + /* if same code only applies to returned zones */
> + if (opt & ZBC_REPORT_ZONE_PARTIAL) {
> + if (clen != ~0ul) {
> + clen = z_len;
> + ctype = zone->type;
> + }
> + if (z_len != clen)
> + len_diffs++;
> + if (zone->type != ctype)
> + type_diffs++;
> ctype = zone->type;
> }
> - if (z_len != clen)
> - len_diffs++;
> - if (zone->type != ctype)
> - type_diffs++;
> - ctype = zone->type;
> - }
> + progress = z_start + z_len;
>
> - /* shift to device units */
> - z_start >>= ilog2(sdkp->device->sector_size) - 9;
> - z_len >>= ilog2(sdkp->device->sector_size) - 9;
> - z_wp_abs >>= ilog2(sdkp->device->sector_size) - 9;
> + if (!bzd) {
> + if (bzrpt)
> + bzrpt->descriptor_count =
> + cpu_to_be32(++entry);
> + continue;
> + }
>
> - if (!bzd) {
> + /* shift to device units */
> + z_start >>= ilog2(sdp->sector_size) - 9;
> + z_len >>= ilog2(sdp->sector_size) - 9;
> + z_wp_abs >>= ilog2(sdp->sector_size) - 9;
> +
> + cond = zone->state;
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> + flgs |= 0x02;
> + else if (zone->wp)
> + flgs |= 0x01; /* flag as RWP recommended? */
> +
> + dscr = &bzd[entry];
> + dscr->lba_start = cpu_to_be64(z_start);
> + dscr->length = cpu_to_be64(z_len);
> + dscr->lba_wptr = cpu_to_be64(z_wp_abs);
> + dscr->type = zone->type;
> + dscr->flags = cond << 4 | flgs;
> + entry++;
> if (bzrpt)
> - bzrpt->descriptor_count =
> - cpu_to_be32(++entry);
> - continue;
> + bzrpt->descriptor_count = cpu_to_be32(entry);
> }
> -
> - bzd[entry].lba_start = cpu_to_be64(z_start);
> - bzd[entry].length = cpu_to_be64(z_len);
> - bzd[entry].lba_wptr = cpu_to_be64(z_wp_abs);
> - bzd[entry].type = zone->type;
> - bzd[entry].flags = cond << 4 | flgs;
> - entry++;
> - if (bzrpt)
> - bzrpt->descriptor_count = cpu_to_be32(entry);
> + spin_unlock_irqrestore(&wpdscr->lock, flags);
> }
>
> /* if same code applies to all zones */
> if (bzrpt && !(opt & ZBC_REPORT_ZONE_PARTIAL)) {
> - for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> - zone = rb_entry(node, struct blk_zone, node);
> -
> - spin_lock_irqsave(&zone->lock, flags);
> - if (clen != ~0ul) {
> - clen = zone->len;
> + for (iter = 0; iter < zi->wps_count; iter++) {
> + wpdscr = zi->wps[iter];
> + spin_lock_irqsave(&wpdscr->lock, flags);
> + for (idscr = 0; idscr < wpdscr->zone_count; idscr++) {
> + z_len = wpdscr->zone_size;
> + zone = &wpdscr->zones[idscr];
> + if (clen != ~0ul) {
> + clen = z_len;
> + ctype = zone->type;
> + }
> + if (z_len != clen)
> + len_diffs++;
> + if (zone->type != ctype)
> + type_diffs++;
> ctype = zone->type;
> }
> - if (zone->len != clen)
> - len_diffs++;
> - if (zone->type != ctype)
> - type_diffs++;
> - ctype = zone->type;
> - spin_unlock_irqrestore(&zone->lock, flags);
> + spin_unlock_irqrestore(&wpdscr->lock, flags);
> }
> }
>
> @@ -985,12 +913,15 @@ out:
> int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
> sector_t sector, unsigned int *num_sectors)
> {
> + struct request_queue *q = sdkp->disk->queue;
> struct blk_zone *zone;
> + sector_t z_start, z_len;
> + spinlock_t *lck;
> unsigned int sectors = *num_sectors;
> int ret = BLKPREP_OK;
> unsigned long flags;
>
> - zone = blk_lookup_zone(sdkp->disk->queue, sector);
> + zone = blk_lookup_zone(q, sector, &z_start, &z_len, &lck);
> if (!zone) {
> /* Might happen during zone initialization */
> sd_zbc_debug_ratelimit(sdkp,
> @@ -999,7 +930,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
> return BLKPREP_OK;
> }
>
> - spin_lock_irqsave(&zone->lock, flags);
> + spin_lock_irqsave(lck, flags);
>
> if (blk_zone_is_cmr(zone))
> goto out;
> @@ -1008,7 +939,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
> zone->state == BLK_ZONE_BUSY) {
> sd_zbc_debug_ratelimit(sdkp,
> "zone %zu state %x, deferring\n",
> - zone->start, zone->state);
> + z_start, zone->state);
> ret = BLKPREP_DEFER;
> goto out;
> }
> @@ -1017,25 +948,22 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
> if (op_is_write(req_op(rq))) {
> u64 nwp = sector + sectors;
>
> - while (nwp > (zone->start + zone->len)) {
> - struct rb_node *node = rb_next(&zone->node);
> + while (nwp > (z_start + z_len)) {
> + zone->wp = z_len;
> + sector = z_start + z_len;
> + sectors = nwp - sector;
> + spin_unlock_irqrestore(lck, flags);
>
> - zone->wp = zone->start + zone->len;
> - sector = zone->wp;
> - sectors = nwp - zone->wp;
> - spin_unlock_irqrestore(&zone->lock, flags);
> -
> - if (!node)
> - return BLKPREP_OK;
> - zone = rb_entry(node, struct blk_zone, node);
> + zone = blk_lookup_zone(q, sector,
> + &z_start, &z_len, &lck);
> if (!zone)
> return BLKPREP_OK;
>
> - spin_lock_irqsave(&zone->lock, flags);
> + spin_lock_irqsave(lck, flags);
> nwp = sector + sectors;
> }
> - if (nwp > zone->wp)
> - zone->wp = nwp;
> + if (nwp > z_start + zone->wp)
> + zone->wp = nwp - z_start;
> }
> goto out;
> }
> @@ -1044,37 +972,37 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
> /* let the drive fail the command */
> sd_zbc_debug_ratelimit(sdkp,
> "zone %zu offline\n",
> - zone->start);
> + z_start);
> goto out;
> }
>
> if (op_is_write(req_op(rq))) {
> if (zone->state == BLK_ZONE_READONLY)
> goto out;
> - if (blk_zone_is_full(zone)) {
> + if (zone->wp == z_len) {
> sd_zbc_debug(sdkp,
> - "Write to full zone %zu/%zu\n",
> - sector, zone->wp);
> + "Write to full zone %zu/%zu/%zu\n",
> + sector, (sector_t)zone->wp, z_len);
> ret = BLKPREP_KILL;
> goto out;
> }
> - if (zone->wp != sector) {
> + if (sector != (z_start + zone->wp)) {
> sd_zbc_debug(sdkp,
> "Misaligned write %zu/%zu\n",
> - sector, zone->wp);
> + sector, z_start + zone->wp);
> ret = BLKPREP_KILL;
> goto out;
> }
> zone->wp += sectors;
> - } else if (zone->wp <= sector + sectors) {
> - if (zone->wp <= sector) {
> + } else if (z_start + zone->wp <= sector + sectors) {
> + if (z_start + zone->wp <= sector) {
> /* Read beyond WP: clear request buffer */
> struct req_iterator iter;
> struct bio_vec bvec;
> void *buf;
> sd_zbc_debug(sdkp,
> "Read beyond wp %zu+%u/%zu\n",
> - sector, sectors, zone->wp);
> + sector, sectors, z_start + zone->wp);
> rq_for_each_segment(bvec, rq, iter) {
> buf = bvec_kmap_irq(&bvec, &flags);
> memset(buf, 0, bvec.bv_len);
> @@ -1085,15 +1013,15 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
> goto out;
> }
> /* Read straddle WP position: limit request size */
> - *num_sectors = zone->wp - sector;
> + *num_sectors = z_start + zone->wp - sector;
> sd_zbc_debug(sdkp,
> "Read straddle wp %zu+%u/%zu => %zu+%u\n",
> - sector, sectors, zone->wp,
> + sector, sectors, z_start + zone->wp,
> sector, *num_sectors);
> }
>
> out:
> - spin_unlock_irqrestore(&zone->lock, flags);
> + spin_unlock_irqrestore(lck, flags);
>
> return ret;
> }
> @@ -1145,21 +1073,22 @@ static void update_zones_from_report(struct scsi_cmnd *cmd, u32 nr_bytes)
> struct bdev_zone_descriptor *entry = &bzde[iter];
> sector_t s = get_start_from_desc(sdkp, entry);
> sector_t z_len = get_len_from_desc(sdkp, entry);
> + sector_t z_strt;
> + spinlock_t *lck;
> unsigned long flags;
>
> if (!z_len)
> goto done;
>
> - zone = blk_lookup_zone(rq->q, s);
> + zone = blk_lookup_zone(rq->q, s, &z_strt, &z_len, &lck);
> if (!zone)
> goto done;
>
> - spin_lock_irqsave(&zone->lock, flags);
> + spin_lock_irqsave(lck, flags);
> zone->type = entry->type & 0xF;
> zone->state = (entry->flags >> 4) & 0xF;
> zone->wp = get_wp_from_desc(sdkp, entry);
> - zone->len = z_len;
> - spin_unlock_irqrestore(&zone->lock, flags);
> + spin_unlock_irqrestore(lck, flags);
> }
> nread += len;
> if (!dmax)
> @@ -1233,113 +1162,314 @@ void sd_zbc_uninit_command(struct scsi_cmnd *cmd)
> }
>
> /**
> - * sd_zbc_init - Load zones of matching zlen size into rb tree.
> + * alloc_cpws() - Allocate space for a contiguous set of write pointers
> + * @items: Number of wps needed.
> + * @lba: lba of the start of the next zone.
> + * @z_start: Starting lba of this contiguous set.
> + * @z_size: Size of each zone in this contiguous set.
> *
> + * Return: Allocated wps or NULL on error.
> */
> -static int sd_zbc_init(struct scsi_disk *sdkp, u64 zlen, char *buf, int buf_len)
> +static struct contiguous_wps *alloc_cpws(int items, u64 lba, u64 z_start,
> + u64 z_size)
> {
> - sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
> - sector_t last_sector;
> + struct contiguous_wps *cwps = NULL;
> + size_t sz;
>
> - if (test_and_set_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags)) {
> - sdev_printk(KERN_WARNING, sdkp->device,
> - "zone initialization already running\n");
> - return 0;
> + sz = sizeof(struct contiguous_wps) + (items * sizeof(struct blk_zone));
> + if (items) {
> + cwps = vzalloc(sz);
> + if (!cwps)
> + goto out;
> + spin_lock_init(&cwps->lock);
> + cwps->start_lba = z_start;
> + cwps->last_lba = lba - 1;
> + cwps->zone_size = z_size;
> + cwps->is_zoned = items > 1 ? 1 : 0;
> + cwps->zone_count = items;
> }
>
> - if (!sdkp->zone_work_q) {
> - char wq_name[32];
> +out:
> + return cwps;
> +}
>
> - sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name);
> - sdkp->zone_work_q = create_singlethread_workqueue(wq_name);
> - if (!sdkp->zone_work_q) {
> - sdev_printk(KERN_WARNING, sdkp->device,
> - "create zoned disk workqueue failed\n");
> - return -ENOMEM;
> +/**
> + * free_zone_wps() - Free up memory in use by wps
> + * @zi: zone wps array(s).
> + */
> +static void free_zone_wps(struct zone_wps *zi)
> +{
> + /* on error free the arrays */
> + if (zi && zi->wps) {
> + int ca;
> +
> + for (ca = 0; ca < zi->wps_count; ca++) {
> + if (zi->wps[ca]) {
> + vfree(zi->wps[ca]);
> + zi->wps[ca] = NULL;
> + }
> }
> - } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
> - drain_workqueue(sdkp->zone_work_q);
> - clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags);
> + kfree(zi->wps);
> }
> +}
>
> - last_sector = zbc_parse_zones(sdkp, zlen, buf, buf_len);
> - capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
> - if (last_sector != -1 && last_sector < capacity) {
> - sd_zbc_update_zones(sdkp, last_sector,
> - SD_ZBC_BUF_SIZE, SD_ZBC_INIT);
> - } else
> - clear_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags);
> +static int wps_realloc(struct zone_wps *zi, gfp_t gfp_mask)
> +{
> + int rcode = 0;
> + struct contiguous_wps **old;
> + struct contiguous_wps **tmp;
> + int n = zi->wps_count * 2;
> +
> + old = zi->wps;
> + tmp = kzalloc(n, sizeof(*zi->wps), gfp_mask);

Apologies, this should be kcalloc() here: kzalloc() takes only (size, flags), not (n, size, flags).

> + if (!tmp) {
> + rcode = -ENOMEM;
> + goto out;
> + }
> + memcpy(tmp, zi->wps, zi->wps_count * sizeof(*zi->wps));
> + zi->wps = tmp;
> + kfree(old);
>
> - return 0;
> +out:
> + return rcode;
> }
>
> +#define FMT_CHANGING_CAPACITY "Changing capacity from %zu to Max LBA+1 %zu"
> +
> /**
> - * sd_zbc_config() - Configure a ZBC device (on attach)
> - * @sdkp: SCSI disk being attached.
> - * @buffer: Buffer to working data.
> - * @buf_sz: Size of buffer to use for working data
> + * zbc_init_zones() - Re-Sync expected WP location with drive
> + * @sdkp: scsi_disk
> + * @gfp_mask: Allocation mask.
> *
> - * Return: true of SD_ZBC_RESET_WP provisioning is supported
> + * Return: 0 on success, otherwise error.
> */
> -bool sd_zbc_config(struct scsi_disk *sdkp, void *buffer, size_t buf_sz)
> +int zbc_init_zones(struct scsi_disk *sdkp, gfp_t gfp_mask)
> {
> - struct bdev_zone_report *bzrpt = buffer;
> - u64 zone_len, lba;
> - int retval;
> - u32 rep_len;
> - u8 same;
> + struct request_queue *q = sdkp->disk->queue;
> + int rcode = 0;
> + int entry = 0;
> + int offset;
> + int offmax;
> + u64 iter;
> + u64 z_start = 0ul;
> + u64 z_size = 0; /* size of zone */
> + int z_count = 0; /* number of zones of z_size */
> + int do_fill = 0;
> + int array_count = 0;
> + int one_time_setup = 0;
> + u8 opt = ZBC_ZONE_REPORTING_OPTION_ALL;
> + size_t bufsz = SD_ZBC_BUF_SIZE;
> + struct bdev_zone_report *rpt = NULL;
> + struct zone_wps *zi = NULL;
> + struct contiguous_wps *cwps = NULL;
> +
> + if (q->zones)
> + goto out;
>
> - if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC)
> - /*
> - * Device managed or normal SCSI disk,
> - * no special handling required
> - */
> - return false;
> -
> - retval = sd_zbc_report_zones(sdkp, bzrpt, buf_sz,
> - 0, ZBC_ZONE_REPORTING_OPTION_ALL, false);
> - if (retval < 0)
> - return false;
> -
> - rep_len = be32_to_cpu(bzrpt->descriptor_count);
> - if (rep_len < 7) {
> - sd_printk(KERN_WARNING, sdkp,
> - "REPORT ZONES report invalid length %u\n",
> - rep_len);
> - return false;
> + zi = kzalloc(sizeof(*zi), gfp_mask);
> + if (!zi) {
> + rcode = -ENOMEM;
> + goto out;
> }
>
> - if (sdkp->rc_basis == 0) {
> - /* The max_lba field is the capacity of a zoned device */
> - lba = be64_to_cpu(bzrpt->maximum_lba);
> - if (lba + 1 > sdkp->capacity) {
> - if (sdkp->first_scan)
> - sd_printk(KERN_WARNING, sdkp,
> - "Changing capacity from %zu to Max LBA+1 %zu\n",
> - sdkp->capacity, (sector_t) lba + 1);
> - sdkp->capacity = lba + 1;
> + if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC) {
> + struct gendisk *disk = sdkp->disk;
> +
> + zi->wps = kzalloc(sizeof(*zi->wps), gfp_mask);
> + zi->wps[0] = alloc_cpws(1, disk->part0.nr_sects, z_start, 1);
> + if (!zi->wps[0]) {
> + rcode = -ENOMEM;
> + goto out;
> }
> + zi->wps_count = 1;
> + goto out;
> + }
> +
> + rpt = kmalloc(bufsz, gfp_mask);
> + if (!rpt) {
> + rcode = -ENOMEM;
> + goto out;
> }
>
> /*
> - * Adjust 'chunk_sectors' to the zone length if the device
> - * supports equal zone sizes.
> + * Start by handling up to 32 different zone sizes. 2 will work
> + * for all the current drives, but maybe something exotic will
> + * surface.
> */
> - same = bzrpt->same_field & 0x0f;
> - if (same > 3) {
> - sd_printk(KERN_WARNING, sdkp,
> - "REPORT ZONES SAME type %d not supported\n", same);
> - return false;
> + zi->wps = kcalloc(32, sizeof(*zi->wps), gfp_mask);
> + zi->wps_count = 32;
> + if (!zi->wps) {
> + rcode = -ENOMEM;
> + goto out;
> }
> - /* Read the zone length from the first zone descriptor */
> - zone_len = be64_to_cpu(bzrpt->descriptors[0].length);
> - sdkp->unmap_alignment = zone_len;
> - sdkp->unmap_granularity = zone_len;
> - blk_queue_chunk_sectors(sdkp->disk->queue,
> - logical_to_sectors(sdkp->device, zone_len));
> -
> - sd_zbc_init(sdkp, zone_len, buffer, buf_sz);
> - return true;
> +
> +fill:
> + offset = 0;
> + offmax = 0;
> + for (entry = 0, iter = 0; iter < sdkp->capacity; entry++) {
> + struct bdev_zone_descriptor *bzde;
> + int stop_end = 0;
> + int stop_size = 0;
> +
> + if (offset == 0) {
> + int err;
> +
> + err = sd_zbc_report_zones(sdkp, rpt, bufsz, iter, opt);
> + if (err) {
> + pr_err("report zones-> %d\n", err);
> + if (err != -ENOTSUPP)
> + rcode = err;
> + goto out;
> + }
> + if (sdkp->rc_basis == 0) {
> + sector_t lba = be64_to_cpu(rpt->maximum_lba);
> +
> + if (lba + 1 > sdkp->capacity) {
> + sd_printk(KERN_WARNING, sdkp,
> + FMT_CHANGING_CAPACITY "\n",
> + sdkp->capacity, lba + 1);
> + sdkp->capacity = lba + 1;
> + }
> + }
> + offmax = max_report_entries(bufsz);
> + }
> + bzde = &rpt->descriptors[offset];
> + if (z_size == 0)
> + z_size = get_len_from_desc(sdkp, bzde);
> + if (z_size != get_len_from_desc(sdkp, bzde))
> + stop_size = 1;
> + if ((iter + z_size) >= sdkp->capacity)
> + stop_end = 1;
> +
> + if (!one_time_setup) {
> + u8 type = bzde->type & 0x0F;
> +
> + if (type != BLK_ZONE_TYPE_CONVENTIONAL) {
> + one_time_setup = 1;
> + blk_queue_chunk_sectors(sdkp->disk->queue,
> + z_size);
> + }
> + }
> +
> + if (do_fill == 0) {
> + if (stop_end || stop_size) {
> + /* include the next/last zone? */
> + if (!stop_size) {
> + z_count++;
> + iter += z_size;
> + }
> + cwps = alloc_cpws(z_count, iter,
> + z_start, z_size);
> + if (!cwps) {
> + rcode = -ENOMEM;
> + goto out;
> + }
> + if (array_count > 0)
> + cwps->is_zoned = 1;
> +
> + zi->wps[array_count] = cwps;
> + z_start = iter;
> + z_size = 0;
> + z_count = 0;
> + array_count++;
> + if (array_count >= zi->wps_count) {
> + rcode = wps_realloc(zi, gfp_mask);
> + if (rcode)
> + goto out;
> + }
> + /* add the runt zone */
> + if (stop_end && stop_size) {
> + z_count++;
> + z_size = get_len_from_desc(sdkp, bzde);
> + cwps = alloc_cpws(z_count,
> + iter + z_size,
> + z_start, z_size);
> + if (!cwps) {
> + rcode = -ENOMEM;
> + goto out;
> + }
> + if (array_count > 0)
> + cwps->is_zoned = 1;
> + zi->wps[array_count] = cwps;
> + array_count++;
> + }
> + if (stop_end) {
> + do_fill = 1;
> + array_count = 0;
> + z_count = 0;
> + z_size = 0;
> + goto fill;
> + }
> + }
> + z_size = get_len_from_desc(sdkp, bzde);
> + iter += z_size;
> + z_count++;
> + } else {
> + fill_zone(zi->wps[array_count], z_count, sdkp, bzde);
> + z_count++;
> + iter += z_size;
> + if (zi->wps[array_count]->zone_count == z_count) {
> + z_count = 0;
> + array_count++;
> + zi->wps_count = array_count;
> + }
> + }
> + offset++;
> + if (offset >= offmax)
> + offset = 0;
> + }
> +out:
> + kfree(rpt);
> +
> + if (rcode) {
> + if (zi) {
> + free_zone_wps(zi);
> + kfree(zi);
> + }
> + } else {
> + q->zones = zi;
> + }
> +
> + return rcode;
> +}
> +
> +/**
> + * sd_zbc_config() - Configure a ZBC device (on attach)
> + * @sdkp: SCSI disk being attached.
> + * @gfp_mask: Memory allocation strategy
> + *
> + * Return: true if SD_ZBC_RESET_WP provisioning is supported
> + */
> +bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp_mask)
> +{
> + bool can_reset_wp = false;
> +
> + if (zbc_init_zones(sdkp, gfp_mask)) {
> + sdev_printk(KERN_WARNING, sdkp->device,
> + "Initialize zone cache failed\n");
> + goto out;
> + }
> +
> + if (sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC)
> + can_reset_wp = true;
> +
> + if (!sdkp->zone_work_q) {
> + char wq_name[32];
> +
> + sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name);
> + sdkp->zone_work_q = create_singlethread_workqueue(wq_name);
> + if (!sdkp->zone_work_q) {
> + sdev_printk(KERN_WARNING, sdkp->device,
> + "create zoned disk workqueue failed\n");
> + goto out;
> + }
> + } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
> + drain_workqueue(sdkp->zone_work_q);
> + clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags);
> + }
> +
> +out:
> + return can_reset_wp;
> }
>
> /**
> @@ -1365,15 +1495,16 @@ void sd_zbc_remove(struct scsi_disk *sdkp)
> */
> unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp)
> {
> - unsigned int bytes = 1;
> struct request_queue *q = sdkp->disk->queue;
> - struct rb_node *node = rb_first(&q->zones);
> + struct zone_wps *zi = q->zones;
> + unsigned int bytes = 1;
>
> - if (node) {
> - struct blk_zone *zone = rb_entry(node, struct blk_zone, node);
> + if (zi && zi->wps_count > 0) {
> + struct contiguous_wps *wp = zi->wps[0];
>
> - bytes = zone->len;
> + bytes = wp->zone_size;
> }
> +
> bytes <<= ilog2(sdkp->device->sector_size);
> return bytes;
> }
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index d5cdb5d..113c5a8 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -264,27 +264,83 @@ struct blk_queue_tag {
>
> #ifdef CONFIG_BLK_DEV_ZONED
>
> +/**
> + * struct blk_zone - A single zone type/stats and WP offset.
> + *
> + * @wp: Holds the wp offset from the start of the zone.
> + * @type: Holds the zone type nibble.
> + * @state: Holds the zone state nibble + kernel (zone busy)
> + * @private_data: Used to hold whatever the implicit domain owner
> + * of the zone needs to track.
> + *
> + * Type is left at 4 bits (only 2 are needed currently) to match
> + * the current ZBC/ZAC standards.
> + *
> + * State is using 5 bits to accommodate the ZONE_BUSY. The first 4 bits
> + * match the current ZBC/ZAC spec.
> + * ZONE_BUSY could be mapped to one of the reserved bits. Using it as
> + * a mask bit or independent flag may be useful for decoding the zone
> + * state before it transitioned to BUSY.
> + *
> + * A zone sized at order (39+9) is very unlikely (current zones are 16+9)
> + * Even at lba48 equivalent number of sectors we have a large amount
> + * of padding to fill out 8 bytes.
> + *
> + * Getting this to fit in 4 bytes would limit the maximum size of a zone
> + * to 4G [order 23 of 512 byte sectors + 9 bits for flags] which is probably
> + * okay for embedded or 32-bit systems where the private_data pointer
> + * would also shrink to 32 bits. There are also WP tracking schemes
> + * that don't make use of the private_data helper so perhaps that
> + * could be factored out as well.
> + */
> struct blk_zone {
> - struct rb_node node;
> - spinlock_t lock;
> - sector_t start;
> - size_t len;
> - sector_t wp;
> - enum blk_zone_type type;
> - enum blk_zone_state state;
> + unsigned long long wp:39;
> + unsigned long long type:4;
> + unsigned long long state:5;
> + unsigned long long padding:15;
> void *private_data;
> };
>
> +/**
> + * struct contiguous_wps - A descriptor of zones of the same size
> + *
> + * @start_lba: LBA of first zone covered by the descriptor.
> + * @last_lba: LBA of last zone.
> + * @zone_size: Size of zones as a number of 512 byte sectors.
> + * @zone_count: Number of zones (last-start/size) for convenience.
> + * @lock: A spinlock protecting these zones.
> + * @is_zoned: 0 when all zones are conventional (no WP) zones.
> + * @zones: Array of blk_zone entries.
> + */
> +struct contiguous_wps {
> + u64 start_lba;
> + u64 last_lba;
> + u64 zone_size;
> + u32 zone_count;
> + spinlock_t lock;
> + unsigned is_zoned:1;
> + struct blk_zone zones[0];
> +};
> +
> +/**
> + * struct zone_wps - A collection of zone descriptors to describe zoned media.
> + *
> + * @wps_count: Number of descriptors.
> + * @wps: Array of zone descriptors.
> + */
> +struct zone_wps {
> + u32 wps_count;
> + struct contiguous_wps **wps;
> +};
> +
> #define blk_zone_is_seq_req(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
> #define blk_zone_is_seq_pref(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_PREF)
> #define blk_zone_is_smr(z) (blk_zone_is_seq_req(z) || blk_zone_is_seq_pref(z))
> #define blk_zone_is_cmr(z) ((z)->type == BLK_ZONE_TYPE_CONVENTIONAL)
> -#define blk_zone_is_full(z) ((z)->wp == (z)->start + (z)->len)
> -#define blk_zone_is_empty(z) ((z)->wp == (z)->start)
> +#define blk_zone_is_empty(z) ((z)->wp == 0)
>
> -extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t);
> -extern struct blk_zone *blk_insert_zone(struct request_queue *,
> - struct blk_zone *);
> +extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t,
> + sector_t *, sector_t *, spinlock_t **);
> extern void blk_drop_zones(struct request_queue *);
> #else
> static inline void blk_drop_zones(struct request_queue *q) { };
> @@ -463,7 +519,7 @@ struct request_queue {
> struct queue_limits limits;
>
> #ifdef CONFIG_BLK_DEV_ZONED
> - struct rb_root zones;
> + struct zone_wps *zones;
> #endif
> /*
> * sg stuff
> --
> 2.9.3
>



--
Shaun Tancheff