[PATCH 3/4] habanalabs: avoid soft lockup bug upon mapping error

From: Oded Gabbay
Date: Sun Mar 14 2021 - 09:06:12 EST


From: farah kassabri <fkassabri@xxxxxxxxx>

Add a little sleep between page unmappings in case mapping of
large number of host pages failed, in order to
avoid soft lockup bug during the rollback.

Signed-off-by: farah kassabri <fkassabri@xxxxxxxxx>
Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
drivers/misc/habanalabs/common/memory.c | 20 +++++++++++++++++---
1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 1b69573369cf..6530fddbbc21 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -857,6 +857,7 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;
u32 page_size = phys_pg_pack->page_size;
int rc = 0;
+ bool is_host_addr;

for (i = 0 ; i < phys_pg_pack->npages ; i++) {
paddr = phys_pg_pack->pages[i];
@@ -878,6 +879,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
return 0;

err:
+ is_host_addr = !hl_is_dram_va(hdev, vaddr);
+
next_vaddr = vaddr;
for (i = 0 ; i < mapped_pg_cnt ; i++) {
if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
@@ -888,6 +891,17 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
phys_pg_pack->pages[i], page_size);

next_vaddr += page_size;
+
+ /*
+ * unmapping on Palladium can be really long, so avoid a CPU
+ * soft lockup bug by sleeping a little between unmapping pages
+ *
+ * In addition, on host num of pages could be huge,
+ * because page size could be 4KB, so when unmapping host
+ * pages sleep every 32K pages to avoid soft lockup
+ */
+ if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
+ usleep_range(50, 200);
}

return rc;
@@ -921,9 +935,9 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
* unmapping on Palladium can be really long, so avoid a CPU
* soft lockup bug by sleeping a little between unmapping pages
*
- * In addition, when unmapping host memory we pass through
- * the Linux kernel to unpin the pages and that takes a long
- * time. Therefore, sleep every 32K pages to avoid soft lockup
+ * In addition, on host num of pages could be huge,
+ * because page size could be 4KB, so when unmapping host
+ * pages sleep every 32K pages to avoid soft lockup
*/
if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
usleep_range(50, 200);
--
2.25.1