[PATCH 1/4] uapi: habanalabs: add signal/wait operations

From: Oded Gabbay
Date: Thu May 07 2020 - 10:37:18 EST


From: Omer Shpigelman <oshpigelman@xxxxxxxxx>

This is a pre-requisite to upstreaming GAUDI support.

Signal/wait operations are done by the user to perform sync between two
Primary Queues (PQs). The sync is done using the sync manager and it is
usually resolved inside the device, but sometimes it can be resolved in the
host, i.e. the user should be able to wait in the host until a signal has
been completed.

The mechanism to define signal and wait operations is done by the driver
because it needs atomicity and serialization, which is already done in the
driver when submitting work to the different queues.

To implement this feature, the driver "takes" a couple of h/w resources,
and this is reflected by the defines added to the uapi file.

The signal/wait operations are done via the existing CS IOCTL, and they use
the same data structure. There is a difference in the meaning of some of
the parameters, and for that we added unions to make the code more
readable.

Signed-off-by: Omer Shpigelman <oshpigelman@xxxxxxxxx>
Reviewed-by: Oded Gabbay <oded.gabbay@xxxxxxxxx>
Signed-off-by: Oded Gabbay <oded.gabbay@xxxxxxxxx>
---
drivers/misc/habanalabs/command_submission.c | 13 +++-
drivers/misc/habanalabs/habanalabs.h | 2 +
include/uapi/misc/habanalabs.h | 67 +++++++++++++++-----
3 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 6680e183d881..f7d03a35e6a8 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -11,6 +11,8 @@
#include <linux/uaccess.h>
#include <linux/slab.h>

+#define HL_CS_FLAGS_SIG_WAIT (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT)
+
static void job_wq_completion(struct work_struct *work);
static long _hl_cs_wait_ioctl(struct hl_device *hdev,
struct hl_ctx *ctx, u64 timeout_us, u64 seq);
@@ -659,7 +661,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
union hl_cs_args *args = data;
struct hl_ctx *ctx = hpriv->ctx;
void __user *chunks_execute, *chunks_restore;
- u32 num_chunks_execute, num_chunks_restore;
+ u32 num_chunks_execute, num_chunks_restore, sig_wait_flags;
u64 cs_seq = ULONG_MAX;
int rc, do_ctx_switch;
bool need_soft_reset = false;
@@ -672,6 +674,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
goto out;
}

+ sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT;
+
+ if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) &&
+ (!hdev->supports_sync_stream))) {
+ dev_err(hdev->dev, "Sync stream CS is not supported\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute;
num_chunks_execute = args->in.num_chunks_execute;

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index b1dc6a22ba0d..7cd9a8d72451 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1347,6 +1347,7 @@ struct hl_device_idle_busy_ts {
* only to POWER9 machines.
* @cdev_sysfs_created: were char devices and sysfs nodes created.
* @stop_on_err: true if engines should stop on error.
+ * @supports_sync_stream: is sync stream supported.
*/
struct hl_device {
struct pci_dev *pdev;
@@ -1429,6 +1430,7 @@ struct hl_device {
u8 power9_64bit_dma_enable;
u8 cdev_sysfs_created;
u8 stop_on_err;
+ u8 supports_sync_stream;

/* Parameters for bring-up */
u8 mmu_enable;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 523e511e6cff..70dfccea7038 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2020 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
@@ -241,52 +241,87 @@ union hl_cb_args {
* compatibility
*/
struct hl_cs_chunk {
- /*
- * For external queue, this represents a Handle of CB on the Host
- * For internal queue, this represents an SRAM or DRAM address of the
- * internal CB
- */
- __u64 cb_handle;
+ union {
+ /* For external queue, this represents a Handle of CB on the
+ * Host.
+ * For internal queue in Goya, this represents an SRAM or
+ * a DRAM address of the internal CB. In Gaudi, this might also
+ * represent a mapped host address of the CB.
+ *
+ * A mapped host address is in the device address space, after
+ * a host address was mapped by the device MMU.
+ */
+ __u64 cb_handle;
+
+ /* Relevant only when HL_CS_FLAGS_WAIT is set.
+ * This holds address of array of u64 values that contain
+ * signal CS sequence numbers. The wait described by this job
+ * will listen on all those signals (wait event per signal)
+ */
+ __u64 signal_seq_arr;
+ };
+
/* Index of queue to put the CB on */
__u32 queue_index;
- /*
- * Size of command buffer with valid packets
- * Can be smaller then actual CB size
- */
- __u32 cb_size;
+
+ union {
+ /*
+ * Size of command buffer with valid packets
+ * Can be smaller then actual CB size
+ */
+ __u32 cb_size;
+
+ /* Relevant only when HL_CS_FLAGS_WAIT is set.
+ * Number of entries in signal_seq_arr
+ */
+ __u32 num_signal_seq_arr;
+ };
+
/* HL_CS_CHUNK_FLAGS_* */
__u32 cs_chunk_flags;
+
/* Align structure to 64 bytes */
__u32 pad[11];
};

+/* SIGNAL and WAIT flags are mutually exclusive */
#define HL_CS_FLAGS_FORCE_RESTORE 0x1
+#define HL_CS_FLAGS_SIGNAL 0x2
+#define HL_CS_FLAGS_WAIT 0x4

#define HL_CS_STATUS_SUCCESS 0

#define HL_MAX_JOBS_PER_CS 512

struct hl_cs_in {
+
/* this holds address of array of hl_cs_chunk for restore phase */
__u64 chunks_restore;
- /* this holds address of array of hl_cs_chunk for execution phase */
+
+ /* holds address of array of hl_cs_chunk for execution phase */
__u64 chunks_execute;
+
/* this holds address of array of hl_cs_chunk for store phase -
* Currently not in use
*/
__u64 chunks_store;
+
/* Number of chunks in restore phase array. Maximum number is
* HL_MAX_JOBS_PER_CS
*/
__u32 num_chunks_restore;
+
/* Number of chunks in execution array. Maximum number is
* HL_MAX_JOBS_PER_CS
*/
__u32 num_chunks_execute;
+
/* Number of chunks in restore phase array - Currently not in use */
__u32 num_chunks_store;
+
/* HL_CS_FLAGS_* */
__u32 cs_flags;
+
/* Context ID - Currently not in use */
__u32 ctx_id;
};
@@ -597,8 +632,8 @@ struct hl_debug_args {
* For jobs on external queues, the user needs to create command buffers
* through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on
* internal queues, the user needs to prepare a "command buffer" with packets
- * on either the SRAM or DRAM, and give the device address of that buffer to
- * the CS ioctl.
+ * on either the device SRAM/DRAM or the host, and give the device address of
+ * that buffer to the CS ioctl.
*
* This IOCTL is asynchronous in regard to the actual execution of the CS. This
* means it returns immediately after ALL the JOBS were enqueued on their
@@ -610,7 +645,7 @@ struct hl_debug_args {
* external JOBS have been completed. Note that if the CS has internal JOBS
* which can execute AFTER the external JOBS have finished, the driver might
* report that the CS has finished executing BEFORE the internal JOBS have
- * actually finish executing.
+ * actually finished executing.
*
* Even though the sequence number increments per CS, the user can NOT
* automatically assume that if CS with sequence number N finished, then CS
--
2.17.1