Re: [PATCH v2 2/4] KVM: x86: move sev_lock/unlock_vcpus_for_migration to kvm_main.c

From: Paolo Bonzini
Date: Wed Apr 16 2025 - 13:48:35 EST


On 4/10/25 10:16, Peter Zijlstra wrote:
On Tue, Apr 08, 2025 at 09:41:34PM -0400, Maxim Levitsky wrote:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 69782df3617f..71c0d8c35b4b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1368,6 +1368,77 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
return 0;
}
+
+/*
+ * Lock all VM vCPUs.
+ * Can be used nested (to lock vCPUs of two VMs, for example)
+ */
+int kvm_lock_all_vcpus_nested(struct kvm *kvm, bool trylock, unsigned int role)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i, j;
+
+ lockdep_assert_held(&kvm->lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+
+ if (trylock && !mutex_trylock_nested(&vcpu->mutex, role))
+ goto out_unlock;
+ else if (!trylock && mutex_lock_killable_nested(&vcpu->mutex, role))
+ goto out_unlock;
+
+#ifdef CONFIG_PROVE_LOCKING
+ if (!i)
+ /*
+ * Reset the role to one that avoids colliding with
+ * the role used for the first vcpu mutex.
+ */
+ role = MAX_LOCK_DEPTH - 1;
+ else
+ mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
+#endif
+ }

This code is all sorts of terrible.

Per the lockdep_assert_held() above, you serialize all these locks by
holding that lock, this means you can be using the _nest_lock()
annotation.

Also, the original code didn't have this trylock nonsense, and the
Changelog doesn't mention this -- in fact the Changelog claims no
change, which is patently false.

Anyway, please write like:

kvm_for_each_vcpu(i, vcpu, kvm) {
if (mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock))
goto unlock;
}

return 0;

unlock:

kvm_for_each_vcpu(j, vcpu, kvm) {
if (j == i)
break;

mutex_unlock(&vcpu->mutex);
}
return -EINTR;

And yes, you'll have to add mutex_lock_killable_nest_lock(), but that
should be trivial.

If I understand correctly, that would actually be
_mutex_lock_killable_nest_lock() plus a wrapper macro. But yes,
that is easy, so it sounds good.

For the ARM case, which is the actual buggy one (it was complaining
about too high a depth) it still needs mutex_trylock_nest_lock();
the nest_lock is needed to avoid bumping the depth on every
mutex_trylock().

It should be something like:
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 2143d05116be..328f573cab6d 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -174,6 +174,12 @@ do { \
_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
} while (0)
+#define mutex_trylock_nest_lock(lock, nest_lock) \
+do { \
+ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
+ _mutex_trylock_nest_lock(lock, &(nest_lock)->dep_map); \
+} while (0)
+
#else
extern void mutex_lock(struct mutex *lock);
extern int __must_check mutex_lock_interruptible(struct mutex *lock);
@@ -185,6 +191,7 @@ extern void mutex_lock_io(struct mutex *lock);
# define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock)
+# define mutex_trylock_nest_lock(lock, nest_lock) mutex_trylock(lock)
#endif
/*
@@ -193,9 +200,14 @@ extern void mutex_lock_io(struct mutex *lock);
*
* Returns 1 if the mutex has been acquired successfully, and 0 on contention.
*/
-extern int mutex_trylock(struct mutex *lock);
+extern int _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
extern void mutex_unlock(struct mutex *lock);
+static inline int mutex_trylock(struct mutex *lock)
+{
+ return _mutex_trylock_nest_lock(lock, NULL);
+}
+
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 555e2b3a665a..d5d1e79495fc 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1063,8 +1063,10 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
#endif
/**
- * mutex_trylock - try to acquire the mutex, without waiting
+ * _mutex_trylock_nest_lock - try to acquire the mutex, without waiting
* @lock: the mutex to be acquired
+ * @nest_lock: if not NULL, a mutex that is always taken whenever multiple
+ * instances of @lock are taken
*
* Try to acquire the mutex atomically. Returns 1 if the mutex
* has been acquired successfully, and 0 on contention.
@@ -1076,7 +1078,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
* This function must not be used in interrupt context. The
* mutex must be released by the same task that acquired it.
*/
-int __sched mutex_trylock(struct mutex *lock)
+int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock)
{
bool locked;
@@ -1084,11 +1086,11 @@ int __sched mutex_trylock(struct mutex *lock)
locked = __mutex_trylock(lock);
if (locked)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
return locked;
}
-EXPORT_SYMBOL(mutex_trylock);
+EXPORT_SYMBOL(_mutex_trylock_nest_lock);
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched

Does that seem sane?

Paolo