[PATCH 5/5] habanalabs: support hard-reset scheduling during soft-reset

From: Oded Gabbay
Date: Sun Dec 26 2021 - 07:43:23 EST


From: Ofir Bitton <obitton@xxxxxxxxx>

As a hard-reset can be requested during a soft-reset, the driver must
allow it to be scheduled; otherwise, critical events received during
the soft-reset will be ignored.

Signed-off-by: Ofir Bitton <obitton@xxxxxxxxx>
Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
drivers/misc/habanalabs/common/device.c | 31 +++++++++++++++++++--
drivers/misc/habanalabs/common/habanalabs.h | 3 ++
2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 84621ad765bc..733338ab6f1d 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -978,7 +978,7 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
- reset_upon_device_release = false;
+ reset_upon_device_release = false, schedule_hard_reset = false;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
struct hl_ctx *ctx;
int i, rc;
@@ -1031,6 +1031,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
/* Block future CS/VM/JOB completion operations */
spin_lock(&hdev->reset_info.lock);
if (hdev->reset_info.in_reset) {
+ /* We only allow scheduling of a hard reset during soft reset */
+ if (hard_reset && hdev->reset_info.is_in_soft_reset)
+ hdev->reset_info.hard_reset_schedule_flags = flags;
spin_unlock(&hdev->reset_info.lock);
return 0;
}
@@ -1193,7 +1196,6 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
* is required for the initialization itself
*/
hdev->disabled = false;
- hdev->reset_info.is_in_soft_reset = false;

rc = hdev->asic_funcs->hw_init(hdev);
if (rc) {
@@ -1243,7 +1245,20 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
}
}

- hdev->reset_info.in_reset = 0;
+ spin_lock(&hdev->reset_info.lock);
+ hdev->reset_info.is_in_soft_reset = false;
+
+ /* Schedule hard reset only if requested and if not already in hard reset.
+ * We keep 'in_reset' enabled, so no other reset can start before the
+ * scheduled hard reset runs
+ */
+ if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
+ schedule_hard_reset = true;
+ else
+ hdev->reset_info.in_reset = 0;
+
+ spin_unlock(&hdev->reset_info.lock);
+
hdev->reset_info.needs_reset = false;

dev_notice(hdev->dev, "Successfully finished resetting the device\n");
@@ -1261,6 +1276,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->reset_info.soft_reset_cnt++;
}

+ if (schedule_hard_reset) {
+ dev_info(hdev->dev, "Performing hard reset scheduled during soft reset\n");
+ flags = hdev->reset_info.hard_reset_schedule_flags;
+ hdev->reset_info.hard_reset_schedule_flags = 0;
+ hdev->disabled = true;
+ hard_reset = true;
+ handle_reset_trigger(hdev, flags);
+ goto again;
+ }
+
return 0;

out_err:
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 37a3a469b42f..cb710fd478b6 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2460,6 +2460,8 @@ struct last_error_session_info {
* @lock: lock to protect critical reset flows.
* @soft_reset_cnt: number of soft reset since the driver was loaded.
* @hard_reset_cnt: number of hard reset since the driver was loaded.
+ * @hard_reset_schedule_flags: flags of a hard reset that was requested during a
+ * soft reset and is scheduled to run once that soft reset completes.
* @in_reset: is device in reset flow.
* @is_in_soft_reset: Device is currently in soft reset process.
* @needs_reset: true if reset_on_lockup is false and device should be reset
@@ -2478,6 +2480,7 @@ struct hl_reset_info {
spinlock_t lock;
u32 soft_reset_cnt;
u32 hard_reset_cnt;
+ u32 hard_reset_schedule_flags;
u8 in_reset;
u8 is_in_soft_reset;
u8 needs_reset;
--
2.25.1