[PATCH 01/12] habanalabs: fix soft reset accounting

From: Oded Gabbay
Date: Sun Nov 28 2021 - 14:36:49 EST


Reset upon device release is not a soft-reset from user/system point
of view. As such, we shouldn't count that reset in the statistics we
gather and expose to the monitoring applications.

We also shouldn't print soft-reset when doing the reset upon device
release.

Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
drivers/misc/habanalabs/common/device.c | 50 ++++++++++++-------------
1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 484e0446381e..2b208007c26f 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -962,13 +962,13 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
*/
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
- bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
+ bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
+ reset_upon_device_release = false;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
int i, rc;

if (!hdev->init_done) {
- dev_err(hdev->dev,
- "Can't reset before initialization is done\n");
+ dev_err(hdev->dev, "Can't reset before initialization is done\n");
return 0;
}

@@ -988,6 +988,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
return -EINVAL;
}

+ reset_upon_device_release = true;
+
goto do_reset;
}

@@ -1024,12 +1026,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)

if (hard_reset)
dev_info(hdev->dev, "Going to reset device\n");
- else if (flags & HL_DRV_RESET_DEV_RELEASE)
- dev_info(hdev->dev,
- "Going to reset device after it was released by user\n");
+ else if (reset_upon_device_release)
+ dev_info(hdev->dev, "Going to reset device after release by user\n");
else
- dev_info(hdev->dev,
- "Going to reset compute engines of inference device\n");
+ dev_info(hdev->dev, "Going to reset engines of inference device\n");
}

again:
@@ -1174,16 +1174,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)

rc = hdev->asic_funcs->hw_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "failed to initialize the H/W after reset\n");
+ dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
goto out_err;
}

/* If device is not idle fail the reset process */
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
- dev_err(hdev->dev,
- "device is not idle (mask 0x%llx_%llx) after reset\n",
+ dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
idle_mask[1], idle_mask[0]);
rc = -EIO;
goto out_err;
@@ -1192,23 +1190,20 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
/* Check that the communication with the device is working */
rc = hdev->asic_funcs->test_queues(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed to detect if device is alive after reset\n");
+ dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
goto out_err;
}

if (hard_reset) {
rc = device_late_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed late init after hard reset\n");
+ dev_err(hdev->dev, "Failed late init after hard reset\n");
goto out_err;
}

rc = hl_vm_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed to init memory module after hard reset\n");
+ dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
goto out_err;
}

@@ -1216,8 +1211,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
} else {
rc = hdev->asic_funcs->soft_reset_late_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed late init after soft reset\n");
+ if (reset_upon_device_release)
+ dev_err(hdev->dev,
+ "Failed late init in reset after device release\n");
+ else
+ dev_err(hdev->dev, "Failed late init after soft reset\n");
goto out_err;
}
}
@@ -1236,7 +1234,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
* the device will be operational although it shouldn't be
*/
hdev->asic_funcs->enable_events_from_fw(hdev);
- } else {
+ } else if (!reset_upon_device_release) {
hdev->soft_reset_cnt++;
}

@@ -1246,12 +1244,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->disabled = true;

if (hard_reset) {
- dev_err(hdev->dev,
- "Failed to reset! Device is NOT usable\n");
+ dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
hdev->hard_reset_cnt++;
+ } else if (reset_upon_device_release) {
+ dev_err(hdev->dev, "Failed to reset device after user release\n");
+ hard_reset = true;
+ goto again;
} else {
- dev_err(hdev->dev,
- "Failed to do soft-reset, trying hard reset\n");
+ dev_err(hdev->dev, "Failed to do soft-reset\n");
hdev->soft_reset_cnt++;
hard_reset = true;
goto again;
--
2.25.1