Re: [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects

From: Tvrtko Ursulin
Date: Tue Aug 30 2022 - 10:20:16 EST



On 17/08/2022 08:34, Thomas Hellström wrote:
On Wed, 2022-08-17 at 09:55 +0300, Sviatoslav Peleshko wrote:
The i915_gem_object_trylock we had in the grab_vma() makes it return
false
when the vma->obj is already locked. In this case we'll skip this vma
during eviction, and eventually might be forced to return -ENOSPC
even
though we could've evicted this vma if we waited for the lock a bit.

To fix this, replace the i915_gem_object_trylock with
i915_gem_object_lock.
And because we have to worry about the potential deadlock now,
bubble-up
the error code, so it will be correctly handled by the WW mechanism.

This fixes the issue
https://gitlab.freedesktop.org/drm/intel/-/issues/6564

Fixes: 7e00897be8bf ("drm/i915: Add object locking to
i915_gem_evict_for_node and i915_gem_evict_something, v2.")
Signed-off-by: Sviatoslav Peleshko
<sviatoslav.peleshko@xxxxxxxxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++-------
--
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
b/drivers/gpu/drm/i915/i915_gem_evict.c
index f025ee4fa526..9d43f213f68f 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
        return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
 }
-static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
*ww)
+static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
*ww)
 {
+       int ret = 0;
+
        /*
         * We add the extra refcount so the object doesn't drop to
zero until
         * after ungrab_vma(), this way trylock is always paired with
unlock.
         */
        if (i915_gem_object_get_rcu(vma->obj)) {
-               if (!i915_gem_object_trylock(vma->obj, ww)) {
+               ret = i915_gem_object_lock(vma->obj, ww);

Isn't the vm->mutex held here? If so, then this would be a lockdep
violation.

Hm.. could the trylock site use a comment exaplaining the reasoning? Ie. trylock not just skipping "busy" objects but truly unavoidable?

Regardless, is the analysis on the spot and are we working on fixing in somehow?

Regards,

Tvrtko


/Thomas




+               if (ret)
                        i915_gem_object_put(vma->obj);
-                       return false;
-               }
        } else {
                /* Dead objects don't need pins */
                atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
        }
-       return true;
+       return ret;
 }
-static void ungrab_vma(struct i915_vma *vma)
+static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
*ww)
 {
        if (dying_vma(vma))
                return;
-       i915_gem_object_unlock(vma->obj);
+       if (!ww)
+               i915_gem_object_unlock(vma->obj);
+
        i915_gem_object_put(vma->obj);
 }
-static bool
+static int
 mark_free(struct drm_mm_scan *scan,
          struct i915_gem_ww_ctx *ww,
          struct i915_vma *vma,
          unsigned int flags,
          struct list_head *unwind)
 {
+       int err;
+
        if (i915_vma_is_pinned(vma))
-               return false;
+               return -ENOSPC;
-       if (!grab_vma(vma, ww))
-               return false;
+       err = grab_vma(vma, ww);
+       if (err)
+               return err;
        list_add(&vma->evict_link, unwind);
-       return drm_mm_scan_add_block(scan, &vma->node);
+       if (!drm_mm_scan_add_block(scan, &vma->node))
+               return -ENOSPC;
+
+       return 0;
 }
 static bool defer_evict(struct i915_vma *vma)
@@ -150,6 +159,7 @@ i915_gem_evict_something(struct
i915_address_space *vm,
        enum drm_mm_insert_mode mode;
        struct i915_vma *active;
        int ret;
+       int err = 0;
        lockdep_assert_held(&vm->mutex);
        trace_i915_gem_evict(vm, min_size, alignment, flags);
@@ -210,17 +220,23 @@ i915_gem_evict_something(struct
i915_address_space *vm,
                        continue;
                }
-               if (mark_free(&scan, ww, vma, flags, &eviction_list))
+               err = mark_free(&scan, ww, vma, flags,
&eviction_list);
+               if (!err)
                        goto found;
+               if (err == -EDEADLK)
+                       break;
        }
        /* Nothing found, clean up and bail out! */
        list_for_each_entry_safe(vma, next, &eviction_list,
evict_link) {
                ret = drm_mm_scan_remove_block(&scan, &vma->node);
                BUG_ON(ret);
-               ungrab_vma(vma);
+               ungrab_vma(vma, ww);
        }
+       if (err == -EDEADLK)
+               return err;
+
        /*
         * Can we unpin some objects such as idle hw contents,
         * or pending flips? But since only the GGTT has global
entries
@@ -267,7 +283,7 @@ i915_gem_evict_something(struct
i915_address_space *vm,
                        __i915_vma_pin(vma);
                } else {
                        list_del(&vma->evict_link);
-                       ungrab_vma(vma);
+                       ungrab_vma(vma, ww);
                }
        }
@@ -277,17 +293,21 @@ i915_gem_evict_something(struct
i915_address_space *vm,
                __i915_vma_unpin(vma);
                if (ret == 0)
                        ret = __i915_vma_unbind(vma);
-               ungrab_vma(vma);
+               ungrab_vma(vma, ww);
        }
        while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
                vma = container_of(node, struct i915_vma, node);
                /* If we find any non-objects (!vma), we cannot evict
them */
-               if (vma->node.color != I915_COLOR_UNEVICTABLE &&
-                   grab_vma(vma, ww)) {
-                       ret = __i915_vma_unbind(vma);
-                       ungrab_vma(vma);
+               if (vma->node.color != I915_COLOR_UNEVICTABLE) {
+                       ret = grab_vma(vma, ww);
+                       if (!ret) {
+                               ret = __i915_vma_unbind(vma);
+                               ungrab_vma(vma, ww);
+                       } else if (ret != -EDEADLK) {
+                               ret = -ENOSPC;
+                       }
                } else {
                        ret = -ENOSPC;
                }
@@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct
i915_address_space *vm,
                        break;
                }
-               if (!grab_vma(vma, ww)) {
-                       ret = -ENOSPC;
+               ret = grab_vma(vma, ww);
+               if (ret) {
+                       if (ret != -EDEADLK)
+                               ret = -ENOSPC;
+
                        break;
                }
@@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct
i915_address_space *vm,
                if (ret == 0)
                        ret = __i915_vma_unbind(vma);
-               ungrab_vma(vma);
+               ungrab_vma(vma, ww);
        }
        return ret;