Re: [PATCH v4 6/7] iommu/riscv: Command and fault queue support

From: Tomasz Jeznach
Date: Wed May 08 2024 - 12:03:45 EST


On Wed, May 8, 2024 at 8:39 AM Zong Li <zong.li@xxxxxxxxxx> wrote:
>
> On Sat, May 4, 2024 at 12:13 AM Tomasz Jeznach <tjeznach@rivosinccom> wrote:
> >
> > Introduce device command submission and fault reporting queues,
> > as described in Chapter 3.1 and 3.2 of the RISC-V IOMMU Architecture
> > Specification.
> >
> > Command and fault queues are instantiated in contiguous system memory
> > local to IOMMU device domain, or mapped from fixed I/O space provided
> > by the hardware implementation. Detection of the location and maximum
> > allowed size of the queue utilize WARL properties of queue base control
> > register. Driver implementation will try to allocate up to 128KB of
> > system memory, while respecting hardware supported maximum queue size.
> >
> > Interrupts allocation is based on interrupt vectors availability and
> > distributed to all queues in simple round-robin fashion. For hardware
> > Implementation with fixed event type to interrupt vector assignment
> > IVEC WARL property is used to discover such mappings.
> >
> > Address translation, command and queue fault handling in this change
> > is limited to simple fault reporting without taking any action.
> >
> > Reviewed-by: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
> > Signed-off-by: Tomasz Jeznach <tjeznach@xxxxxxxxxxxx>
> > ---
> > drivers/iommu/riscv/iommu-bits.h | 75 +++++
> > drivers/iommu/riscv/iommu.c | 496 ++++++++++++++++++++++++++++++-
> > drivers/iommu/riscv/iommu.h | 21 ++
> > 3 files changed, 590 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> > index ba093c29de9f..40c379222821 100644
> > --- a/drivers/iommu/riscv/iommu-bits.h
> > +++ b/drivers/iommu/riscv/iommu-bits.h
> > @@ -704,4 +704,79 @@ struct riscv_iommu_msi_pte {
> > #define RISCV_IOMMU_MSI_MRIF_NPPN RISCV_IOMMU_PPN_FIELD
> > #define RISCV_IOMMU_MSI_MRIF_NID_MSB BIT_ULL(60)
> >
> > +/* Helper functions: command structure builders. */
> > +
> > +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
> > +{
> > + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) |
> > + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
> > + cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
> > + u64 addr)
> > +{
> > + cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr));
> > + cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
> > + int pscid)
> > +{
> > + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
> > + RISCV_IOMMU_CMD_IOTINVAL_PSCV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
> > + int gscid)
> > +{
> > + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
> > + RISCV_IOMMU_CMD_IOTINVAL_GV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
> > +{
> > + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > + RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW;
> > + cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
> > + u64 addr, u32 data)
> > +{
> > + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
> > + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
> > + FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) |
> > + RISCV_IOMMU_CMD_IOFENCE_AV;
> > + cmd->dword1 = addr >> 2;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
> > +{
> > + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
> > + cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
> > +{
> > + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
> > + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
> > + cmd->dword1 = 0;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
> > + unsigned int devid)
> > +{
> > + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) |
> > + RISCV_IOMMU_CMD_IODIR_DV;
> > +}
> > +
> > +static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd,
> > + unsigned int pasid)
> > +{
> > + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid);
> > +}
> > +
> > #endif /* _RISCV_IOMMU_BITS_H_ */
> > diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> > index 71b7903d83d4..4349ac8a3990 100644
> > --- a/drivers/iommu/riscv/iommu.c
> > +++ b/drivers/iommu/riscv/iommu.c
> > @@ -25,7 +25,14 @@
> > #include "iommu.h"
> >
> > /* Timeouts in [us] */
> > -#define RISCV_IOMMU_DDTP_TIMEOUT 50000
> > +#define RISCV_IOMMU_QCSR_TIMEOUT 150000
> > +#define RISCV_IOMMU_QUEUE_TIMEOUT 150000
> > +#define RISCV_IOMMU_DDTP_TIMEOUT 10000000
> > +#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
> > +
> > +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
> > +#define RISCV_IOMMU_DEF_CQ_COUNT 8192
> > +#define RISCV_IOMMU_DEF_FQ_COUNT 4096
> >
> > /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
> > #define phys_to_ppn(va) (((va) >> 2) & (((1ULL << 44) - 1) << 10))
> > @@ -89,6 +96,432 @@ static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
> > riscv_iommu_devres_pages_match, &devres);
> > }
> >
> > +/*
> > + * Hardware queue allocation and management.
> > + */
> > +
> > +/* Setup queue base, control registers and default queue length */
> > +#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \
> > + struct riscv_iommu_queue *_q = q; \
> > + _q->qid = RISCV_IOMMU_INTR_ ## name; \
> > + _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
> > + _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
> > + _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
> > +} while (0)
> > +
> > +/* Note: offsets are the same for all queues */
> > +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
> > +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
> > +#define Q_ITEM(q, index) ((q)->mask & (index))
> > +#define Q_IPSR(q) BIT((q)->qid)
> > +
> > +/*
> > + * Discover queue ring buffer hardware configuration, allocate in-memory
> > + * ring buffer or use fixed I/O memory location, configure queue base register.
> > + * Must be called before hardware queue is enabled.
> > + *
> > + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
> > + * @entry_size - queue single element size in bytes.
> > + */
> > +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
> > + struct riscv_iommu_queue *queue,
> > + size_t entry_size)
> > +{
> > + unsigned int logsz;
> > + u64 qb, rb;
> > +
> > + /*
> > + * Use WARL base register property to discover maximum allowed
> > + * number of entries and optional fixed IO address for queue location.
> > + */
> > + riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOGSZ_FIELD);
> > + qb = riscv_iommu_readq(iommu, queue->qbr);
> > +
> > + /*
> > + * Calculate and verify hardware supported queue length, as reported
> > + * by the field LOGSZ, where max queue length is equal to 2^(LOGSZ + 1).
> > + * Update queue size based on hardware supported value.
> > + */
> > + logsz = ilog2(queue->mask);
> > + if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb))
> > + logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, qb);
> > +
> > + /*
> > + * Use WARL base register property to discover an optional fixed IO
> > + * address for queue ring buffer location. Otherwise allocate contigus
> > + * system memory.
> > + */
> > + if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
> > + const size_t queue_size = entry_size << (logsz + 1);
> > +
> > + queue->phys = ppn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
> > + queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
> > + } else {
> > + do {
> > + const size_t queue_size = entry_size << (logsz + 1);
> > + const int order = get_order(queue_size);
> > +
> > + queue->base = riscv_iommu_get_pages(iommu, order);
> > + queue->phys = __pa(queue->base);
> > + } while (!queue->base && logsz-- > 0);
> > + }
> > +
> > + if (!queue->base)
> > + return -ENOMEM;
> > +
> > + qb = phys_to_ppn(queue->phys) |
> > + FIELD_PREP(RISCV_IOMMU_QUEUE_LOGSZ_FIELD, logsz);
> > +
> > + /* Update base register and read back to verify hw accepted our write */
> > + riscv_iommu_writeq(iommu, queue->qbr, qb);
> > + rb = riscv_iommu_readq(iommu, queue->qbr);
> > + if (rb != qb) {
> > + dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
> > + return -ENODEV;
> > + }
> > +
> > + /* Update actual queue mask */
> > + queue->mask = (2U << logsz) - 1;
> > +
> > + dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
> > + queue->qid, logsz + 1);
> > +
> > + return 0;
> > +}
> > +
> > +/* Check interrupt queue status, IPSR */
> > +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
> > +{
> > + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > +
> > + if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
> > + return IRQ_WAKE_THREAD;
> > +
> > + return IRQ_NONE;
> > +}
> > +
> > +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
> > +{
> > + /* Reuse IVEC.CIV mask for all interrupt vectors mapping. */
> > + return (iommu->ivec >> (n * 4)) & RISCV_IOMMU_IVEC_CIV;
> > +}
> > +
> > +/*
> > + * Enable queue processing in the hardware, register interrupt handler.
> > + *
> > + * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
> > + * @irq_handler - threaded interrupt handler.
> > + */
> > +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
> > + struct riscv_iommu_queue *queue,
> > + irq_handler_t irq_handler)
> > +{
> > + const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
> > + u32 csr;
> > + int rc;
> > +
> > + if (queue->iommu)
> > + return -EBUSY;
> > +
> > + /* Polling not implemented */
> > + if (!irq)
> > + return -ENODEV;
> > +
> > + queue->iommu = iommu;
> > + rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
> > + IRQF_ONESHOT | IRQF_SHARED,
> > + dev_name(iommu->dev), queue);
> > + if (rc) {
> > + queue->iommu = NULL;
> > + return rc;
> > + }
> > +
> > + /*
> > + * Enable queue with interrupts, clear any memory fault if any.
> > + * Wait for the hardware to acknowledge request and activate queue
> > + * processing.
> > + * Note: All CSR bitfields are in the same offsets for all queues.
> > + */
> > + riscv_iommu_writel(iommu, queue->qcr,
> > + RISCV_IOMMU_QUEUE_ENABLE |
> > + RISCV_IOMMU_QUEUE_INTR_ENABLE |
> > + RISCV_IOMMU_QUEUE_MEM_FAULT);
> > +
> > + riscv_iommu_readl_timeout(iommu, queue->qcr,
> > + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> > + 10, RISCV_IOMMU_QCSR_TIMEOUT);
> > +
> > + if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
> > + RISCV_IOMMU_QUEUE_BUSY |
> > + RISCV_IOMMU_QUEUE_MEM_FAULT))) {
> > + /* Best effort to stop and disable failing hardware queue. */
> > + riscv_iommu_writel(iommu, queue->qcr, 0);
> > + free_irq(irq, queue);
> > + queue->iommu = NULL;
> > + dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
> > + return -EBUSY;
> > + }
> > +
> > + /* Clear any pending interrupt flag. */
> > + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > +
> > + return 0;
> > +}
> > +
> > +/*
> > + * Disable queue. Wait for the hardware to acknowledge request and
> > + * stop processing enqueued requests. Report errors but continue.
> > + */
> > +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
> > +{
> > + struct riscv_iommu_device *iommu = queue->iommu;
> > + u32 csr;
> > +
> > + if (!iommu)
> > + return;
> > +
> > + free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
> > + riscv_iommu_writel(iommu, queue->qcr, 0);
> > + riscv_iommu_readl_timeout(iommu, queue->qcr,
> > + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
> > + 10, RISCV_IOMMU_QCSR_TIMEOUT);
> > +
> > + if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
> > + dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
> > + queue->qid, csr);
> > +
> > + queue->iommu = NULL;
> > +}
> > +
> > +/*
> > + * Returns number of available valid queue entries and the first item index.
> > + * Update shadow producer index if necessary.
> > + */
> > +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
> > + unsigned int *index)
> > +{
> > + unsigned int head = atomic_read(&queue->head);
> > + unsigned int tail = atomic_read(&queue->tail);
> > + unsigned int last = Q_ITEM(queue, tail);
> > + int available = (int)(tail - head);
> > +
> > + *index = head;
> > +
> > + if (available > 0)
> > + return available;
> > +
> > + /* read hardware producer index, check reserved register bits are not set. */
> > + if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
> > + tail, (tail & ~queue->mask) == 0,
> > + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
> > + dev_err_once(queue->iommu->dev,
> > + "Hardware error: queue access timeout\n");
> > + return 0;
> > + }
> > +
> > + if (tail == last)
> > + return 0;
> > +
> > + /* update shadow producer index */
> > + return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
> > +}
> > +
> > +/*
> > + * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
> > + */
> > +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
> > +{
> > + const unsigned int head = atomic_add_return(count, &queue->head);
> > +
> > + riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
> > +}
> > +
> > +/* Return actual consumer index based on hardware reported queue head index. */
> > +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
> > +{
> > + const unsigned int cons = atomic_read(&queue->head);
> > + const unsigned int last = Q_ITEM(queue, cons);
> > + unsigned int head;
> > +
> > + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> > + !(head & ~queue->mask),
> > + 0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > + return cons;
> > +
> > + return cons + ((head - last) & queue->mask);
> > +}
> > +
> > +/* Wait for submitted item to be processed. */
> > +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
> > + unsigned int index,
> > + unsigned int timeout_us)
> > +{
> > + unsigned int cons = atomic_read(&queue->head);
> > +
> > + /* Already processed by the consumer */
> > + if ((int)(cons - index) > 0)
> > + return 0;
> > +
> > + /* Monitor consumer index */
> > + return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
> > + (int)(cons - index) > 0, 0, timeout_us);
> > +}
>
> Apart from iofence command, it seems to me that we might not be able
> to use the head pointer to determine whether the command is executed,
> because spec mentioned that the IOMMU advancing cqh is not a guarantee
> that the commands fetched by the IOMMU have been executed or
> committed. For iofence completion, perhaps using ADD and AV to write
> memory would be better as well.
>

I'll update the comment to the function that this is *only* allowed to
track IOFENCE.C completion. Call to riscv_iommu_queue_wait() is used
only for this purpose (unless specification will be extended to track
other commands with head/tail updates).

Previous version of the command tracking used address/AV notifier, but
after several iterations and testing we've considered tracking
head/tail as best approach.

> > +
> > +/* Enqueue an entry and wait to be processed if timeout_us > 0
> > + *
> > + * Error handling for IOMMU hardware not responding in reasonable time
> > + * will be added as separate patch series along with other RAS features.
> > + * For now, only report hardware failure and continue.
> > + */
> > +static void riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
> > + void *entry, size_t entry_size,
> > + unsigned int timeout_us)
> > +{
> > + unsigned int prod;
> > + unsigned int head;
> > + unsigned int tail;
> > + unsigned long flags;
> > +
> > + /* Do not preempt submission flow. */
> > + local_irq_save(flags);
> > +
> > + /* 1. Allocate some space in the queue */
> > + prod = atomic_inc_return(&queue->prod) - 1;
> > + head = atomic_read(&queue->head);
> > +
> > + /* 2. Wait for space availability. */
> > + if ((prod - head) > queue->mask) {
> > + if (readx_poll_timeout(atomic_read, &queue->head,
> > + head, (prod - head) < queue->mask,
> > + 0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > + goto err_busy;
> > + } else if ((prod - head) == queue->mask) {
> > + const unsigned int last = Q_ITEM(queue, head);
> > +
> > + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
> > + !(head & ~queue->mask) && head != last,
> > + 0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > + goto err_busy;
> > + atomic_add((head - last) & queue->mask, &queue->head);
> > + }
> > +
> > + /* 3. Store entry in the ring buffer. */
> > + memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
> > +
> > + /* 4. Wait for all previous entries to be ready */
> > + if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
> > + 0, RISCV_IOMMU_QUEUE_TIMEOUT))
> > + goto err_busy;
> > +
> > + /* 5. Complete submission and restore local interrupts */
> > + dma_wmb();
> > + riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
> > + atomic_inc(&queue->tail);
> > + local_irq_restore(flags);
> > +
> > + if (timeout_us && riscv_iommu_queue_wait(queue, prod, timeout_us))
> > + dev_err_once(queue->iommu->dev,
> > + "Hardware error: command execution timeout\n");
> > +
> > + return;
> > +
> > +err_busy:
> > + local_irq_restore(flags);
> > + dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
> > +}
> > +
> > +/*
> > + * IOMMU Command queue chapter 3.1
> > + */
> > +
> > +/* Command queue interrupt handler thread function */
> > +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
> > +{
> > + const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > + unsigned int ctrl;
> > +
> > + /* Clear MF/CQ errors, complete error recovery to be implemented. */
> > + ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
> > + if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
> > + RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
> > + riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
>
> For RISCV_IOMMU_CQCSR_CMD_ILL and RISCV_IOMMU_CQCSR_CMD_TO, I think we
> need to adjust the head/tail pointer, otherwise, IOMMU might keep
> trying to execute the problematic command.
>
> > + dev_warn(queue->iommu->dev,
> > + "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
>
> It might be a bit weird if we view fence_w_ip as queue error and print
> the message as error?

Driver does not set WSI bit in IOFENCE.C command structure, so it's
unexpected to see fence_w_ip set. This message is warning only about
unexpected command queue interrupt, listing state of possible
triggers.

>
> > + queue->qid,
> > + !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
> > + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
> > + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
> > + !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
> > + }
> > +
> > + /* Placeholder for command queue interrupt notifiers */
> > +
> > + /* Clear command interrupt pending. */
> > + riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > +
> > + return IRQ_HANDLED;
> > +}
> > +
> > +/* Send command to the IOMMU command queue */
> > +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
> > + struct riscv_iommu_command *cmd,
> > + unsigned int timeout_us)
> > +{
> > + riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd), timeout_us);
> > +}
> > +
> > +/*
> > + * IOMMU Fault/Event queue chapter 3.2
> > + */
> > +
> > +static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
> > + struct riscv_iommu_fq_record *event)
> > +{
> > + unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
> > + unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
> > +
> > + /* Placeholder for future fault handling implementation, report only. */
> > + if (err)
> > + dev_warn_ratelimited(iommu->dev,
> > + "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
> > + err, devid, event->iotval, event->iotval2);
> > +}
> > +
> > +/* Fault queue interrupt handler thread function */
> > +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
> > +{
> > + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
> > + struct riscv_iommu_device *iommu = queue->iommu;
> > + struct riscv_iommu_fq_record *events;
> > + unsigned int ctrl, idx;
> > + int cnt, len;
> > +
> > + events = (struct riscv_iommu_fq_record *)queue->base;
> > +
> > + /* Clear fault interrupt pending and process all received fault events. */
> > + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
> > +
> > + do {
> > + cnt = riscv_iommu_queue_consume(queue, &idx);
> > + for (len = 0; len < cnt; idx++, len++)
> > + riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
> > + riscv_iommu_queue_release(queue, cnt);
> > + } while (cnt > 0);
> > +
> > + /* Clear MF/OF errors, complete error recovery to be implemented. */
> > + ctrl = riscv_iommu_readl(iommu, queue->qcr);
> > + if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
> > + riscv_iommu_writel(iommu, queue->qcr, ctrl);
> > + dev_warn(iommu->dev,
> > + "Queue #%u error; memory fault:%d overflow:%d\n",
> > + queue->qid,
> > + !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
> > + !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
> > + }
> > +
> > + return IRQ_HANDLED;
> > +}
> > +
> > /* Lookup and initialize device context info structure. */
> > static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
> > unsigned int devid)
> > @@ -250,6 +683,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> > struct device *dev = iommu->dev;
> > u64 ddtp, rq_ddtp;
> > unsigned int mode, rq_mode = ddtp_mode;
> > + struct riscv_iommu_command cmd;
> >
> > ddtp = riscv_iommu_read_ddtp(iommu);
> > if (ddtp & RISCV_IOMMU_DDTP_BUSY)
> > @@ -317,6 +751,18 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
> > if (mode != ddtp_mode)
> > dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
> >
> > + /* Invalidate device context cache */
> > + riscv_iommu_cmd_iodir_inval_ddt(&cmd);
> > + riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +
> > + /* Invalidate address translation cache */
> > + riscv_iommu_cmd_inval_vma(&cmd);
> > + riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +
> > + /* IOFENCE.C */
> > + riscv_iommu_cmd_iofence(&cmd);
> > + riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
> > +
> > return 0;
> > }
> >
> > @@ -492,6 +938,26 @@ static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
> > return -EINVAL;
> > }
> >
> > + /*
> > + * Distribute interrupt vectors, always use first vector for CIV.
> > + * At least one interrupt is required. Read back and verify.
> > + */
> > + if (!iommu->irqs_count)
> > + return -EINVAL;
> > +
> > + iommu->ivec = 0;
> > + iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_FIV, 1 % iommu->irqs_count);
> > + iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PIV, 2 % iommu->irqs_count);
> > + iommu->ivec |= FIELD_PREP(RISCV_IOMMU_IVEC_PMIV, 3 % iommu->irqs_count);
> > + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_IVEC, iommu->ivec);
> > +
> > + iommu->ivec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_IVEC);
> > + if (riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_CIV) >= RISCV_IOMMU_INTR_COUNT ||
> > + riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_FIV) >= RISCV_IOMMU_INTR_COUNT ||
> > + riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PIV) >= RISCV_IOMMU_INTR_COUNT ||
> > + riscv_iommu_queue_vec(iommu, RISCV_IOMMU_IVEC_PMIV) >= RISCV_IOMMU_INTR_COUNT)
> > + return -EINVAL;
> > +
> > return 0;
> > }
> >
> > @@ -500,12 +966,17 @@ void riscv_iommu_remove(struct riscv_iommu_device *iommu)
> > iommu_device_unregister(&iommu->iommu);
> > iommu_device_sysfs_remove(&iommu->iommu);
> > riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > + riscv_iommu_queue_disable(&iommu->cmdq);
> > + riscv_iommu_queue_disable(&iommu->fltq);
> > }
> >
> > int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > {
> > int rc;
> >
> > + RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
> > + RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
> > +
> > rc = riscv_iommu_init_check(iommu);
> > if (rc)
> > return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
> > @@ -514,10 +985,28 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > if (rc)
> > return rc;
> >
> > - rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > + rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
> > + sizeof(struct riscv_iommu_command));
> > + if (rc)
> > + return rc;
> > +
> > + rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
> > + sizeof(struct riscv_iommu_fq_record));
> > + if (rc)
> > + return rc;
> > +
> > + rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
> > if (rc)
> > return rc;
> >
> > + rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
> > + if (rc)
> > + goto err_queue_disable;
> > +
> > + rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_MAX);
> > + if (rc)
> > + goto err_queue_disable;
> > +
> > rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
> > dev_name(iommu->dev));
> > if (rc) {
> > @@ -537,5 +1026,8 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu)
> > iommu_device_sysfs_remove(&iommu->iommu);
> > err_iodir_off:
> > riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_MODE_OFF);
> > +err_queue_disable:
> > + riscv_iommu_queue_disable(&iommu->fltq);
> > + riscv_iommu_queue_disable(&iommu->cmdq);
> > return rc;
> > }
> > diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
> > index f1696926582c..03e0c45bc7e1 100644
> > --- a/drivers/iommu/riscv/iommu.h
> > +++ b/drivers/iommu/riscv/iommu.h
> > @@ -17,6 +17,22 @@
> >
> > #include "iommu-bits.h"
> >
> > +struct riscv_iommu_device;
> > +
> > +struct riscv_iommu_queue {
> > + atomic_t prod; /* unbounded producer allocation index */
> > + atomic_t head; /* unbounded shadow ring buffer consumer index */
> > + atomic_t tail; /* unbounded shadow ring buffer producer index */
> > + unsigned int mask; /* index mask, queue length - 1 */
> > + unsigned int irq; /* allocated interrupt number */
> > + struct riscv_iommu_device *iommu; /* iommu device handling the queue when active */
> > + void *base; /* ring buffer kernel pointer */
> > + dma_addr_t phys; /* ring buffer physical address */
> > + u16 qbr; /* base register offset, head and tail reference */
> > + u16 qcr; /* control and status register offset */
> > + u8 qid; /* queue identifier, same as RISCV_IOMMU_INTR_XX */
> > +};
> > +
> > struct riscv_iommu_device {
> > /* iommu core interface */
> > struct iommu_device iommu;
> > @@ -34,6 +50,11 @@ struct riscv_iommu_device {
> > /* available interrupt numbers, MSI or WSI */
> > unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
> > unsigned int irqs_count;
> > + unsigned int ivec;
> > +
> > + /* hardware queues */
> > + struct riscv_iommu_queue cmdq;
> > + struct riscv_iommu_queue fltq;
> >
> > /* device directory */
> > unsigned int ddt_mode;
> > --
> > 2.34.1
> >

Best,
- Tomasz

> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@xxxxxxxxxxxxxxxxxxx
> > http://lists.infradead.org/mailman/listinfo/linux-riscv