[PATCH 2/2] nvme: add emulation for zone-append

From: Kanchan Joshi
Date: Tue Aug 18 2020 - 01:33:12 EST


If the drive does not support zone-append natively, emulate it using
regular writes.
An emulated zone-append command write-locks the target zone, preventing
concurrent appends/writes to the same zone.

To determine the start LBA for such writes, an array of 32-bit
zone-relative write-pointer (WP) positions is attached to the namespace.
The cached WP position is updated on successful completion as follows:
- APPEND/WRITE/WRITE_ZEROES/WRITE_SAME advance it by the number of
512-byte sectors written.
- ZONE_RESET sets it to 0 for the target zone. ZONE_RESET_ALL does the
same for all zones.
- ZONE_FINISH sets it to the zone size.

On failed completion of any of the above requests, the cached WP position
of the target zone is marked invalid. On a subsequent zone-append to that
zone, the WP position is refreshed by querying the device (i.e. zone report).

If an emulated append cannot proceed immediately because the zone is
write-locked or its WP position is invalid, the block layer is asked to
retry it.

Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx>
Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
Signed-off-by: SelvaKumar S <selvakuma.s1@xxxxxxxxxxx>
Signed-off-by: Javier Gonzalez <javier.gonz@xxxxxxxxxxx>
---
drivers/nvme/host/core.c | 41 +++++-
drivers/nvme/host/nvme.h | 60 ++++++++
drivers/nvme/host/zns.c | 306 ++++++++++++++++++++++++++++++++++++++-
3 files changed, 398 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88cff309d8e4..78faddf444c3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -287,10 +287,17 @@ void nvme_complete_rq(struct request *req)
nvme_retry_req(req);
return;
}
- } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- req_op(req) == REQ_OP_ZONE_APPEND) {
- req->__sector = nvme_lba_to_sect(req->q->queuedata,
- le64_to_cpu(nvme_req(req)->result.u64));
+ } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+ bool need_wp_offset_update = false;
+ struct nvme_ns *ns = req->q->queuedata;
+ /* append-emulation requires wp update for some cmds*/
+ if (ns && nvme_is_append_emulated(ns))
+ need_wp_offset_update = nvme_need_zone_wp_update(req);
+ if (need_wp_offset_update)
+ nvme_zone_wp_update(ns, req, status);
+ else if (req_op(req) == REQ_OP_ZONE_APPEND)
+ req->__sector = nvme_lba_to_sect(ns,
+ le64_to_cpu(nvme_req(req)->result.u64));
}

nvme_trace_bio_complete(req, status);
@@ -456,6 +463,8 @@ static void nvme_free_ns(struct kref *kref)
{
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

+ if (nvme_is_append_emulated(ns))
+ nvme_teardown_append_emulate(ns);
if (ns->ndev)
nvme_nvm_unregister(ns);

@@ -809,7 +818,15 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
break;
case REQ_OP_ZONE_APPEND:
- ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
+ if (!nvme_is_append_emulated(ns))
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
+ else {
+ /* prepare append like write, and adjust lba afterwards */
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+ if (ret)
+ break;
+ ret = nvme_append_to_write(ns, req, cmd);
+ }
break;
default:
WARN_ON_ONCE(1);
@@ -2150,7 +2167,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;

- ret = blk_revalidate_disk_zones(disk, NULL);
+ ret = nvme_revalidate_disk_zones(disk);
if (!ret)
blk_queue_max_zone_append_sectors(disk->queue,
ctrl->max_zone_append);
@@ -3900,6 +3917,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (__nvme_revalidate_disk(disk, id))
goto out_put_disk;

+ /* setup append-emulation if required */
+ if (nvme_is_append_emulated(ns)) {
+ ret = nvme_setup_append_emulate(ns);
+ if (ret) {
+ dev_warn(ns->ctrl->device,
+ "append-emulation failed, zoned namespace:%d\n",
+ ns->head->ns_id);
+ nvme_clear_append_emulated(ns);
+ goto out_put_disk;
+ }
+ }
+
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
ret = nvme_nvm_register(ns, disk_name, node);
if (ret) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ebb8c3ed3885..c84d418fb001 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -421,6 +421,19 @@ enum nvme_ns_features {
NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
};

+#ifdef CONFIG_BLK_DEV_ZONED
+/*
+ * State used to emulate zone-append with regular writes on a zoned namespace.
+ * One instance is attached to a namespace through nvme_ns->za_emul.
+ */
+struct nvme_za_emul {
+/* zone count, computed at setup from the disk capacity and ns->zsze */
+unsigned int nr_zones;
+/* taken around reads and updates of zones_wp_offset[] entries */
+spinlock_t zones_wp_offset_lock;
+/*
+ * cached zone-relative write pointer of each zone, in 512-byte sectors,
+ * or one of the ZNS_*_WP_OFST markers
+ */
+u32 *zones_wp_offset;
+/* array allocated during zone revalidation and swapped with zones_wp_offset */
+u32 *rev_wp_offset;
+/* deferred work that re-reads write pointers from the device */
+struct work_struct zone_wp_offset_work;
+/* preallocated buffer sized for a report holding a single zone descriptor */
+char *zone_wp_update_buf;
+/* serializes concurrent zone revalidation */
+struct mutex rev_mutex;
+/* namespace this state belongs to */
+struct nvme_ns *ns;
+};
+#endif
+
struct nvme_ns {
struct list_head list;

@@ -443,6 +456,10 @@ struct nvme_ns {
u8 pi_type;
#ifdef CONFIG_BLK_DEV_ZONED
u64 zsze;
+ /* set if append needs to be emulated */
+ u8 append_emulate;
+ /* contains all other append-emulation fields */
+ struct nvme_za_emul *za_emul;
#endif
unsigned long features;
unsigned long flags;
@@ -759,9 +776,52 @@ int nvme_report_zones(struct gendisk *disk, sector_t sector,
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action);
+
+int nvme_revalidate_disk_zones(struct gendisk *disk);
+/* append-emulation only helpers */
+int nvme_setup_append_emulate(struct nvme_ns *ns);
+void nvme_teardown_append_emulate(struct nvme_ns *ns);
+blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+ struct nvme_command *cmd);
+bool nvme_need_zone_wp_update(struct request *rq);
+void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+ blk_status_t status);
+void nvme_set_append_emulated(struct nvme_ns *ns);
+void nvme_clear_append_emulated(struct nvme_ns *ns);
+int nvme_is_append_emulated(struct nvme_ns *ns);
#else
#define nvme_report_zones NULL

+/*
+ * Zoned block device support is compiled out: the append-emulation helpers
+ * below collapse to no-ops so that callers need no #ifdef of their own.
+ */
+
+/* No-op: there is no emulation state to enable. */
+static inline void nvme_set_append_emulated(struct nvme_ns *ns)
+{
+}
+
+/* No-op: there is no emulation state to disable. */
+static inline void nvme_clear_append_emulated(struct nvme_ns *ns)
+{
+}
+
+/* Append emulation is never active in this configuration. */
+static inline int nvme_is_append_emulated(struct nvme_ns *ns)
+{
+	return 0;
+}
+
+/* Nothing to allocate; always reports success. */
+static inline int nvme_setup_append_emulate(struct nvme_ns *ns)
+{
+	return 0;
+}
+
+/* Nothing to release. */
+static inline void nvme_teardown_append_emulate(struct nvme_ns *ns)
+{
+}
+
+/* An append cannot be converted to a write here; reject the request. */
+static inline blk_status_t nvme_append_to_write(struct nvme_ns *ns,
+						struct request *req,
+						struct nvme_command *cmd)
+{
+	return BLK_STS_NOTSUPP;
+}
+
+/* No cached write pointers exist, so no request ever needs an update. */
+static inline bool nvme_need_zone_wp_update(struct request *rq)
+{
+	return false;
+}
+
+/* No-op: there is no cached write pointer to update. */
+static inline void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+				       blk_status_t status)
+{
+}
+
static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action)
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index cabd870fb64e..0b1e9f62045a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,6 +7,10 @@
#include <linux/vmalloc.h>
#include "nvme.h"

+/*
+ * Marker values stored in the cached per-zone write-pointer array used for
+ * append-emulation, in place of a real zone-relative offset.
+ */
+/* cached offset is unusable; the next emulated append schedules a refresh from the device */
+#define ZNS_INVALID_WP_OFST (~0u)
+/* a refresh of this zone has been handed to the worker; appends are retried meanwhile */
+#define ZNS_UPDATING_WP_OFST (ZNS_INVALID_WP_OFST - 1)
+
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
{
struct nvme_command c = { };
@@ -44,13 +48,14 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
struct nvme_id_ns_zns *id;
int status;

- /* Driver requires zone append support */
+ /* Driver does append-emulation if drive does not support zone-append */
if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) &
NVME_CMD_EFFECTS_CSUPP)) {
dev_warn(ns->ctrl->device,
- "append not supported for zoned namespace:%d\n",
+ "append is emulated for zoned namespace:%d\n",
ns->head->ns_id);
- return -EINVAL;
+ /* activate append-emulation */
+ nvme_set_append_emulated(ns);
}

/* Lazily query controller append limit for the first zoned namespace */
@@ -255,3 +260,298 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,

return BLK_STS_OK;
}
+
+/*
+ * Callback handed to blk_revalidate_disk_zones(): publish the write-pointer
+ * array prepared for this revalidation by exchanging it with the active one.
+ * The caller frees whatever ends up in rev_wp_offset afterwards.
+ */
+static void nvme_revalidate_zones_cb(struct gendisk *disk)
+{
+	struct nvme_ns_head *head = NULL;
+	struct nvme_ns *ns;
+	int srcu_idx;
+
+	ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
+	/*
+	 * Do not dereference a missing namespace; leave the active array
+	 * untouched instead.
+	 * NOTE(review): assumes nvme_get_ns_from_disk() needs no matching
+	 * nvme_put_ns_from_disk() when it returns NULL - confirm against its
+	 * definition.
+	 */
+	if (unlikely(!ns))
+		return;
+
+	swap(ns->za_emul->zones_wp_offset, ns->za_emul->rev_wp_offset);
+	nvme_put_ns_from_disk(head, srcu_idx);
+}
+
+/* Tell whether zone-append is emulated for @ns: 1 if it is, 0 otherwise. */
+inline int nvme_is_append_emulated(struct nvme_ns *ns)
+{
+	if (ns->append_emulate != 1)
+		return 0;
+	return 1;
+}
+
+/* Flag @ns as needing zone-append emulation. */
+inline void nvme_set_append_emulated(struct nvme_ns *ns)
+{
+	ns->append_emulate = 1;
+}
+
+/* Drop the zone-append emulation flag of @ns. */
+inline void nvme_clear_append_emulated(struct nvme_ns *ns)
+{
+	ns->append_emulate = 0;
+}
+
+/*
+ * Revalidate the zones of @disk.
+ *
+ * Without append emulation this is a plain blk_revalidate_disk_zones() call.
+ * With emulation, a replacement write-pointer array sized for the current
+ * capacity is allocated and swapped in by nvme_revalidate_zones_cb(); the
+ * array left in rev_wp_offset afterwards (the old one on success, the unused
+ * new one on failure) is freed here.
+ *
+ * Returns 0 on success, -ENOMEM if the replacement array cannot be allocated,
+ * or the error reported by blk_revalidate_disk_zones().
+ */
+int nvme_revalidate_disk_zones(struct gendisk *disk)
+{
+	struct nvme_ns *ns = disk->private_data;
+	struct nvme_za_emul *za_emul;
+	unsigned int nr_zones;
+	int ret = 0;
+
+	if (!nvme_is_append_emulated(ns))
+		return blk_revalidate_disk_zones(disk, NULL);
+
+	za_emul = ns->za_emul;
+
+	/* serialize multiple revalidate calls */
+	mutex_lock(&za_emul->rev_mutex);
+	nr_zones = get_capacity(disk) >> ilog2(ns->zsze);
+
+	/* avoid rescanning zones when the zone count is unchanged */
+	if (nr_zones == za_emul->nr_zones &&
+	    disk->queue->nr_zones == nr_zones)
+		goto unlock;
+
+	za_emul->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_NOIO);
+	if (!za_emul->rev_wp_offset) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	/*
+	 * Nothing fills this array with the device's write pointers, so a
+	 * zeroed entry would wrongly describe the zone as empty. Seed every
+	 * entry with ZNS_INVALID_WP_OFST (all bytes 0xff) so the first
+	 * emulated append to a zone re-reads its write pointer.
+	 */
+	memset(za_emul->rev_wp_offset, 0xff, nr_zones * sizeof(u32));
+
+	ret = blk_revalidate_disk_zones(disk, nvme_revalidate_zones_cb);
+	/* keep the cached zone count in step with the array now in use */
+	if (!ret)
+		za_emul->nr_zones = nr_zones;
+
+	/* on success rev_wp_offset now holds the array that was replaced */
+	kvfree(za_emul->rev_wp_offset);
+	za_emul->rev_wp_offset = NULL;
+unlock:
+	mutex_unlock(&za_emul->rev_mutex);
+	return ret;
+}
+
+/*
+ * Translate a reported zone into a zone-relative write-pointer offset:
+ * open and closed zones yield the distance from the zone start to the write
+ * pointer, a full zone yields its length, and every other condition yields 0.
+ */
+static unsigned int nvme_get_zone_wp_offset(struct blk_zone *zone)
+{
+	/*
+	 * Empty zones start at 0. Offline and read-only zones do not have a
+	 * valid write pointer, so they are treated like an empty zone, as is
+	 * any unrecognised condition.
+	 */
+	unsigned int offset = 0;
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_IMP_OPEN:
+	case BLK_ZONE_COND_EXP_OPEN:
+	case BLK_ZONE_COND_CLOSED:
+		offset = zone->wp - zone->start;
+		break;
+	case BLK_ZONE_COND_FULL:
+		offset = zone->len;
+		break;
+	default:
+		break;
+	}
+
+	return offset;
+}
+
+/*
+ * Zone-report callback: store the write-pointer offset derived from @zone in
+ * slot @idx of the cache passed through @data (a struct nvme_za_emul).
+ * Must be called with zones_wp_offset_lock held. Always returns 0.
+ */
+static int nvme_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
+				    void *data)
+{
+	struct nvme_za_emul *emul = data;
+	u32 *slot = &emul->zones_wp_offset[idx];
+
+	lockdep_assert_held(&emul->zones_wp_offset_lock);
+	*slot = nvme_get_zone_wp_offset(zone);
+
+	return 0;
+}
+
+/*
+ * Worker that refreshes cached write pointers from the device.
+ *
+ * Every zone whose cache entry is ZNS_UPDATING_WP_OFST is queried with a
+ * single-descriptor zone report and its entry is rewritten from the result.
+ * The spinlock is dropped around the report and retaken before the cache is
+ * touched. On exit the namespace reference taken when the work was queued is
+ * released.
+ */
+static void nvme_update_wp_offset_workfn(struct work_struct *work)
+{
+	struct nvme_za_emul *za_emul;
+	struct nvme_ns *ns;
+	unsigned int zno;
+	unsigned long flags;
+	struct nvme_zone_report *report;
+	int buflen, ret;
+
+	buflen = sizeof(struct nvme_zone_report) +
+		 sizeof(struct nvme_zone_descriptor);
+	za_emul = container_of(work, struct nvme_za_emul, zone_wp_offset_work);
+	ns = za_emul->ns;
+	report = (struct nvme_zone_report *)za_emul->zone_wp_update_buf;
+
+	spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+
+	for (zno = 0; zno < za_emul->nr_zones; zno++) {
+		if (za_emul->zones_wp_offset[zno] != ZNS_UPDATING_WP_OFST)
+			continue;
+		spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+
+		memset(report, 0, buflen);
+		ret = __nvme_ns_report_zones(ns, (zno * ns->zsze),
+					     report, buflen);
+
+		spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+		if (ret > 0)
+			nvme_zone_parse_entry(ns, &report->entries[0],
+					      zno, nvme_update_wp_offset_cb,
+					      za_emul);
+		/*
+		 * The report or its parsing failed and nothing else rewrote
+		 * the entry meanwhile. Fall back to INVALID so that a later
+		 * append schedules another refresh; left at UPDATING, the
+		 * zone would reject appends with BLK_STS_RESOURCE forever.
+		 */
+		if (za_emul->zones_wp_offset[zno] == ZNS_UPDATING_WP_OFST)
+			za_emul->zones_wp_offset[zno] = ZNS_INVALID_WP_OFST;
+	}
+	spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+	/* remove the reference obtained when this work was queued */
+	nvme_put_ns(ns);
+}
+
+/*
+ * Turn an emulated zone-append into a regular write at the cached write
+ * pointer of the target zone.
+ *
+ * @cmd must already have been prepared as a write for @req; only its start
+ * LBA is rewritten here. The zone is write-locked for the lifetime of the
+ * request and the lock is dropped again on every error return.
+ *
+ * Returns BLK_STS_OK on success, BLK_STS_RESOURCE when the zone is already
+ * write-locked or its cached write pointer is being refreshed (the request
+ * should be retried), or BLK_STS_IOERR when the data does not fit in the zone.
+ */
+blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+				  struct nvme_command *cmd)
+{
+	blk_status_t ret = BLK_STS_OK;
+	struct nvme_za_emul *za_emul = ns->za_emul;
+	unsigned int nr_sectors = (blk_rq_bytes(req) >> SECTOR_SHIFT);
+	unsigned int wp_offset, zno = blk_rq_zone_no(req);
+	sector_t lba = blk_rq_pos(req);
+	unsigned long flags;
+
+	if (!blk_req_zone_write_trylock(req))
+		return BLK_STS_RESOURCE;
+
+	spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+	wp_offset = za_emul->zones_wp_offset[zno];
+	switch (wp_offset) {
+	case ZNS_INVALID_WP_OFST:
+		/*
+		 * update zone wp-offset in a deferred worker.
+		 * postpone processing current request until worker manages
+		 * to refresh wp by querying from device.
+		 */
+		kref_get(&ns->kref);
+		za_emul->zones_wp_offset[zno] = ZNS_UPDATING_WP_OFST;
+		/*
+		 * The worker drops exactly one reference per run. If the work
+		 * was already pending, that run also covers this zone, so give
+		 * the extra reference back. The pending work still holds its
+		 * own reference, so this put cannot be the final one.
+		 */
+		if (!queue_work(nvme_wq, &za_emul->zone_wp_offset_work))
+			nvme_put_ns(ns);
+		fallthrough;
+	case ZNS_UPDATING_WP_OFST:
+		ret = BLK_STS_RESOURCE;
+		break;
+	default:
+		/* widen before adding so the sum cannot wrap in 32 bits */
+		if ((u64)wp_offset + nr_sectors > ns->zsze) {
+			ret = BLK_STS_IOERR;
+			break;
+		}
+		lba += wp_offset;
+	}
+	spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+	/* unlock zone in case of error, update lba otherwise */
+	if (ret)
+		blk_req_zone_write_unlock(req);
+	else
+		cmd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, lba));
+	return ret;
+}
+
+/*
+ * Decide whether completing @rq has to update the cached write pointer.
+ * Zone append, finish, reset and reset-all always do; write, write-zeroes
+ * and write-same do only when blk_rq_zone_is_seq() is true for the request.
+ * Every other operation does not.
+ */
+bool nvme_need_zone_wp_update(struct request *rq)
+{
+	bool needed = false;
+
+	switch (req_op(rq)) {
+	case REQ_OP_ZONE_APPEND:
+	case REQ_OP_ZONE_FINISH:
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_RESET_ALL:
+		needed = true;
+		break;
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_WRITE_SAME:
+		needed = blk_rq_zone_is_seq(rq);
+		break;
+	default:
+		break;
+	}
+
+	return needed;
+}
+
+/*
+ * Update the cached write pointer after @rq completed with @status.
+ *
+ * On failure the entry of the target zone (every entry for a reset-all) is
+ * marked invalid so that it is re-read from the device before the next
+ * emulated append. On success the entry is advanced, zeroed or set to the
+ * zone size depending on the operation; for an emulated append the request
+ * sector is first moved to where the data was written. The zone write-lock
+ * taken for an emulated append is released in either case.
+ */
+void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+			 blk_status_t status)
+{
+	struct nvme_za_emul *za_emul = ns->za_emul;
+	unsigned long flags;
+	unsigned int zno = blk_rq_zone_no(rq);
+	enum req_opf op = req_op(rq);
+	unsigned int res_bytes = blk_rq_bytes(rq);
+	size_t tbl_bytes;
+	u32 *wp_ofst;
+
+	spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+	wp_ofst = za_emul->zones_wp_offset;
+	tbl_bytes = za_emul->nr_zones * sizeof(*wp_ofst);
+	/*
+	 * Failure handling first, mark wp_offset invalid.
+	 * This will force updating wp from device on subsequent access
+	 */
+	if (status) {
+		if (op == REQ_OP_ZONE_RESET_ALL) {
+			/*
+			 * memset() stores single bytes: 0xff in every byte
+			 * yields ZNS_INVALID_WP_OFST (~0u) in every entry.
+			 */
+			memset(wp_ofst, 0xff, tbl_bytes);
+		} else if (wp_ofst[zno] != ZNS_UPDATING_WP_OFST) {
+			/* an entry already being refreshed is left alone */
+			wp_ofst[zno] = ZNS_INVALID_WP_OFST;
+		}
+		goto unlock;
+	}
+	/* success case handling, update wp-offset */
+	switch (op) {
+	case REQ_OP_ZONE_APPEND:
+		rq->__sector += wp_ofst[zno];
+		fallthrough;
+	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE:
+		/*
+		 * every write should update the wp_offset, but never advance
+		 * a marker value or an entry already at the zone size
+		 */
+		if (wp_ofst[zno] != ZNS_INVALID_WP_OFST &&
+		    wp_ofst[zno] != ZNS_UPDATING_WP_OFST &&
+		    wp_ofst[zno] < ns->zsze)
+			wp_ofst[zno] += res_bytes >> SECTOR_SHIFT;
+		break;
+	case REQ_OP_ZONE_RESET:
+		wp_ofst[zno] = 0;
+		break;
+	case REQ_OP_ZONE_FINISH:
+		wp_ofst[zno] = ns->zsze;
+		break;
+	case REQ_OP_ZONE_RESET_ALL:
+		memset(wp_ofst, 0, tbl_bytes);
+		break;
+	default:
+		break;
+	}
+unlock:
+	spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+	/* release zone write-lock for append */
+	if (op == REQ_OP_ZONE_APPEND)
+		blk_req_zone_write_unlock(rq);
+}
+
+/*
+ * Allocate and initialise the append-emulation state of @ns.
+ *
+ * The write-pointer arrays are left unallocated (NULL) here; only the
+ * single-zone report buffer is preallocated. Warns if @ns already has
+ * emulation state attached.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ */
+int nvme_setup_append_emulate(struct nvme_ns *ns)
+{
+	/* room for a report header followed by exactly one zone descriptor */
+	const size_t report_len = sizeof(struct nvme_zone_report) +
+				  sizeof(struct nvme_zone_descriptor);
+	struct nvme_za_emul *emul;
+
+	WARN_ON(ns->za_emul);
+
+	emul = kmalloc(sizeof(struct nvme_za_emul), GFP_KERNEL);
+	if (!emul)
+		return -ENOMEM;
+
+	emul->zone_wp_update_buf = kzalloc(report_len, GFP_KERNEL);
+	if (!emul->zone_wp_update_buf) {
+		kfree(emul);
+		return -ENOMEM;
+	}
+
+	emul->zones_wp_offset = NULL;
+	emul->rev_wp_offset = NULL;
+	emul->nr_zones = get_capacity(ns->disk) >> ilog2(ns->zsze);
+	spin_lock_init(&emul->zones_wp_offset_lock);
+	mutex_init(&emul->rev_mutex);
+	INIT_WORK(&emul->zone_wp_offset_work, nvme_update_wp_offset_workfn);
+
+	/* link namespace and emulation state to each other */
+	emul->ns = ns;
+	ns->za_emul = emul;
+
+	return 0;
+}
+
+/*
+ * Release the append-emulation state attached to @ns.
+ *
+ * Warns and returns without touching anything when no state is attached,
+ * instead of dereferencing a NULL pointer. ns->za_emul is cleared so that no
+ * dangling pointer is left behind.
+ */
+void nvme_teardown_append_emulate(struct nvme_ns *ns)
+{
+	struct nvme_za_emul *za_emul = ns->za_emul;
+
+	if (WARN_ON(!za_emul))
+		return;
+
+	kvfree(za_emul->zones_wp_offset);
+	kfree(za_emul->zone_wp_update_buf);
+	za_emul->zones_wp_offset = NULL;
+	za_emul->rev_wp_offset = NULL;
+	ns->za_emul = NULL;
+	kfree(za_emul);
+}
--
2.17.1