[PATCH v5 6/6] locking/pvqspinlock: Queue node adaptive spinning

From: Waiman Long
Date: Fri Aug 07 2015 - 23:19:59 EST


In an overcommitted guest where some vCPUs have to be halted to make
forward progress in other areas, it is highly likely that a vCPU later
in the spinlock queue will be spinning while the ones earlier in the
queue would have been halted. The spinning in the later vCPUs is then
just a waste of precious CPU cycles because they are not going to
get the lock any time soon, as the earlier ones have to be woken up
and take their turn to get the lock first.

Reducing the spinning threshold was found to improve performance in
an overcommitted VM guest, but to decrease performance when there is
no overcommitment.

This patch implements an adaptive spinning mechanism where the vCPU
will call pv_wait() earlier if all the following conditions are true
(a standalone sketch of the decision logic follows the list):

1) the vCPU has not been halted before;
2) the previous vCPU is in the halted state;
3) the current vCPU has issued a lot of pv_wait() calls recently.
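
For illustration, here is a minimal standalone C sketch of that decision.
It is not the kernel code: demo_wait_early(), struct demo_node and the
assumed SPIN_THRESHOLD value are made up for the example, and the extra
"spin at least 1/2 of SPIN_THRESHOLD when next to the queue head" check
is omitted; the authoritative version is pv_wait_early() in the diff below.

/*
 * Userspace sketch of the wait-early decision, for illustration only.
 * demo_wait_early() mirrors conditions 1)-3) above plus the rate-limit
 * check performed once every 256 spin loops.
 */
#include <stdbool.h>
#include <stdio.h>

#define SPIN_THRESHOLD		(1 << 15)	/* assumed, as on x86 */
#define PV_WAITHIST_MASK	0xff		/* re-check every 256 loops */
#define PV_WAITHIST_THRESHOLD	30		/* "a lot of pv_wait() recently" */

enum vcpu_state { vcpu_running, vcpu_halted, vcpu_hashed };

struct demo_node {			/* stand-in for struct pv_node */
	enum vcpu_state state;
	unsigned char wait_hist;	/* recent pv_wait() history */
	unsigned char wait_early;
};

static bool demo_wait_early(struct demo_node *node, struct demo_node *prev,
			    int waitcnt, int loop)
{
	if (waitcnt)				/* 1) has been halted before */
		return false;
	if (loop & PV_WAITHIST_MASK)		/* not time to check yet */
		return false;
	if (prev->state == vcpu_running)	/* 2) previous vCPU not halted */
		return false;
	if (node->wait_hist <= PV_WAITHIST_THRESHOLD)	/* 3) too few recent waits */
		return false;
	node->wait_early = 1;
	return true;
}

int main(void)
{
	struct demo_node prev = { .state = vcpu_halted };
	struct demo_node node = { .state = vcpu_running, .wait_hist = 35 };

	/* First check of the spin loop: prev halted, history above threshold */
	printf("wait early? %s\n",
	       demo_wait_early(&node, &prev, 0, SPIN_THRESHOLD) ? "yes" : "no");
	return 0;
}

Built with a plain cc, this prints "wait early? yes" because the previous
vCPU is halted and the wait history is above the threshold.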

Linux kernel builds were run in a KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems were configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

                Westmere               Haswell
Patch           32 vCPUs  48 vCPUs    32 vCPUs  48 vCPUs
-----           --------  --------    --------  --------
Before patch     3m03.2s   9m21.1s     2m08.9s  16m14.8s
After patch      3m04.1s   9m28.5s     2m09.5s   8m29.3s

This patch seemed to cause a tiny bit of performance degradation
for 32 vCPUs. For 48 vCPUs, there wasn't much change for Westmere,
but there was a pretty big performance improvement for Haswell.

Signed-off-by: Waiman Long <Waiman.Long@xxxxxx>
---
kernel/locking/qspinlock.c | 5 +-
kernel/locking/qspinlock_paravirt.h | 111 +++++++++++++++++++++++++++++++++-
2 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 94fdd27..da39d43 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -258,7 +258,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock,
@@ -415,7 +416,7 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);

- pv_wait_node(node);
+ pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
}

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 9996609..f03bd7a 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -31,6 +31,38 @@
#define PENDING_SPIN_THRESHOLD (SPIN_THRESHOLD >> 5)

/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will spin less if the following conditions are all true:
+ * 1) vCPU in the previous node is halted
+ * 2) it has not been halted before
+ * 3) the current vCPU has issued a lot of pv_wait() calls recently
+ *
+ * The last condition is being monitored by the wait_hist field in the pv_node
+ * structure which tracks the history of pv_wait() relative to slowpath calls.
+ * Each pv_wait will increment this field by PV_WAITHIST_INC until it exceeds
+ * PV_WAITHIST_MAX. Each slowpath lock call will decrement it by 1 until it
+ * reaches PV_WAITHIST_MIN. If its value is higher than PV_WAITHIST_THRESHOLD,
+ * the vCPU will spin less. The point of this adaptive scheme is to enable
+ * wait-early mode only on an over-committed guest, where it helps
+ * performance, and to keep it off when the guest is not over-committed,
+ * where it would hurt performance.
+ *
+ * With PV_WAITHIST_INC set to 4, each slowpath call that does a !wait-early
+ * pv_wait() nets a wait_hist increment of 3 (+4 minus the per-call decrement
+ * of 1), while each slowpath call without pv_wait() decrements wait_hist
+ * by 1. The threshold is set at about 3/4 of the range so that about 10
+ * steps from either edge will reach it. If, on average, more than 1/4 of
+ * all slowpath calls result in a pv_wait(), wait-early mode stays on.
+ */
+#define PV_WAITHIST_MASK 0xff
+#define PV_WAITHIST_INC 4
+#define PV_WAITHIST_MIN 1
+#define PV_WAITHIST_MAX 40
+#define PV_WAITHIST_THRESHOLD 30
+#define PV_CAN_WAIT_EARLY(w) ((w)->wait_hist > PV_WAITHIST_THRESHOLD)
+
+/*
* Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed.
*/
@@ -46,6 +78,8 @@ struct pv_node {

int cpu;
u8 state;
+ u8 wait_hist;
+ u8 wait_early;
};

/*
@@ -55,6 +89,7 @@ enum pv_qlock_stat {
pvstat_wait_head,
pvstat_wait_node,
pvstat_wait_again,
+ pvstat_wait_early,
pvstat_kick_wait,
pvstat_kick_unlock,
pvstat_kick_ahead,
@@ -76,6 +111,7 @@ static const char * const stat_fsnames[pvstat_num] = {
[pvstat_wait_head] = "wait_head_count",
[pvstat_wait_node] = "wait_node_count",
[pvstat_wait_again] = "wait_again_count",
+ [pvstat_wait_early] = "wait_early_count",
[pvstat_kick_wait] = "kick_wait_count",
[pvstat_kick_unlock] = "kick_unlock_count",
[pvstat_kick_ahead] = "kick_ahead_count",
@@ -322,6 +358,63 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
}

/*
+ * Helper functions to maintain the wait_hist field.
+ */
+static inline void pv_inc_waithist(struct pv_node *node)
+{
+ /*
+ * pv_wait() in wait_early mode doesn't count as much as !wait_early
+ */
+ if (node->wait_hist < PV_WAITHIST_MAX)
+ node->wait_hist += node->wait_early ? 1 : PV_WAITHIST_INC;
+}
+
+static inline void pv_dec_waithist(struct pv_node *node)
+{
+ node->wait_early = 0;
+ if (node->wait_hist > PV_WAITHIST_MIN)
+ node->wait_hist--;
+}
+
+/*
+ * Return true if the current queue node vCPU should call pv_wait() now.
+ *
+ * The following conditions have to be true for this function to return true:
+ * 1) vCPU in the previous node is halted
+ * 2) it has not been halted before (waitcnt == 0)
+ * 3) it is time to perform the check ((loop & PV_WAITHIST_MASK) == 0)
+ * 4) PV_CAN_WAIT_EARLY() is true.
+ * 5) for the vCPU next to the queue head, it has to spin at least 1/2 of
+ * SPIN_THRESHOLD before waiting.
+ *
+ * In essence, this function causes the queue node vCPU to halt itself whenever
+ * the previous one has been halted. However, if it has been halted and kicked
+ * before, it is assumed to be near the queue head and is going to get the lock
+ * soon. So it won't go into the wait-early mode.
+ */
+static inline bool pv_wait_early(struct pv_node *node, struct pv_node *prev,
+ int waitcnt, int loop)
+{
+ bool wait_early;
+
+ if (waitcnt || ((loop & PV_WAITHIST_MASK) != 0))
+ return false;
+
+ wait_early = PV_CAN_WAIT_EARLY(node) &&
+ (READ_ONCE(prev->state) != vcpu_running);
+ if (wait_early) {
+ if ((SPIN_THRESHOLD - loop < SPIN_THRESHOLD/2) &&
+ READ_ONCE(prev->mcs.locked))
+ return false;
+
+ pvstat_inc(pvstat_wait_early);
+ node->wait_early = 1;
+ }
+
+ return wait_early;
+}
+
+/*
* Initialize the PV part of the mcs_spinlock node.
*/
static void pv_init_node(struct mcs_spinlock *node)
@@ -332,6 +425,7 @@ static void pv_init_node(struct mcs_spinlock *node)

pn->cpu = smp_processor_id();
pn->state = vcpu_running;
+ pn->wait_early = 0;
}

/*
@@ -407,9 +501,10 @@ gotlock:
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
int waitcnt = 0;
int loop;

@@ -417,6 +512,8 @@ static void pv_wait_node(struct mcs_spinlock *node)
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
+ if (pv_wait_early(pn, pp, waitcnt, loop))
+ break;
cpu_relax();
}

@@ -432,6 +529,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
smp_store_mb(pn->state, vcpu_halted);

if (!READ_ONCE(node->locked)) {
+ pv_inc_waithist(pn);
pvstat_inc(pvstat_wait_node);
if (waitcnt)
pvstat_inc(pvstat_wait_again);
@@ -449,12 +547,15 @@ static void pv_wait_node(struct mcs_spinlock *node)
break;
/*
* If the locked flag is still not set after wakeup, it is a
- * spurious wakeup and the vCPU should wait again. However,
- * there is a pretty high overhead for CPU halting and kicking.
+ * spurious wakeup unless it is in wait-early mode. However, there
+ * is a pretty high overhead for CPU halting and kicking.
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
- pvstat_inc(pvstat_spurious);
+ if (!pn->wait_early)
+ pvstat_inc(pvstat_spurious);
+ else
+ pn->wait_early = 0; /* Reset wait-early mode */
}

/*
@@ -515,6 +616,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;

+ pv_dec_waithist(pn); /* Pre-decrement the wait_hist field */
for (;; waitcnt++) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
@@ -549,6 +651,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
return;
}
}
+ pv_inc_waithist(pn);
pvstat_inc(pvstat_wait_head);
if (waitcnt)
pvstat_inc(pvstat_wait_again);
--
1.7.1
