[PATCH 8/9] locking/qspinlock: Limit CNA node scan after the lock is freed

From: Waiman Long
Date: Tue Jul 28 2020 - 15:13:47 EST


When there is a long list of lock waiters, the worst case scenario
for CNA is that none of them run on the same NUMA node as the waiter
at the head of the queue. All the lock waiters have to be scanned
before that can be determined. If the lock becomes free early in the
scan, there will be a long delay between the lock being freed and it
being re-acquired after the full list has been scanned. The worst case
delay is O(n), where n is the number of lock waiters, which is bounded
by the number of CPUs in the system.

One way to limit the delay is to limit the number of queue nodes to be
scanned after the lock is freed. Assuming a random distribution of NUMA
nodes among the lock waiters, the chance of not finding a lock waiter
on the same NUMA node is ((n-1)/n)^m, where n is the number of NUMA
nodes and m is the number of lock waiters scanned. If we limit the
number of scans after the lock is freed to 4n, for example, the chance
of not finding a same-NUMA-node lock waiter will be 1.4% and 1.6% for
8 and 16-socket servers respectively. Note that scanning while the lock
is still held is not limited at all, so the actual chance of missing
the right lock waiter is even lower.
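
As a quick sanity check of these percentages (purely illustrative, not
part of the patch), the ((n-1)/n)^m miss probability can be evaluated
with a small userspace program:

#include <math.h>
#include <stdio.h>

int main(void)
{
	int nodes[] = { 8, 16 };

	for (int i = 0; i < 2; i++) {
		int n = nodes[i];		/* number of NUMA nodes */
		int m = 4 * n;			/* scan limit of 4n waiters */
		double miss = pow((double)(n - 1) / n, m);

		printf("n=%2d, m=%3d: miss probability = %.1f%%\n",
		       n, m, miss * 100.0);
	}
	return 0;	/* prints ~1.4% for n=8 and ~1.6% for n=16 */
}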

To make this possible, the lock waiter scanning and the lock spinning
have to be done at the same time. Since the scan now keeps going until
the lock is freed, plus a bounded amount of additional look-ahead if
necessary, the extra scanning done in cna_lock_handoff() isn't needed
anymore.
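
For illustration only, here is a simplified, userspace-flavoured sketch
of the combined scan/spin loop with bounded lookahead. The struct,
function and variable names below are made up and do not correspond to
the kernel code; the real implementation is cna_order_queue() in the
diff below.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct waiter {
	struct waiter *next;
	int numa_node;
};

/* Return the first waiter after @head on @head's NUMA node, or NULL. */
struct waiter *scan_while_spinning(atomic_int *lock_word,
				   struct waiter *head, int lookahead)
{
	struct waiter *cur = head;
	bool locked = true;

	while (lookahead) {
		struct waiter *next;

		if (locked &&
		    !atomic_load_explicit(lock_word, memory_order_relaxed))
			locked = false;	/* lock freed, start counting lookahead */

		next = cur->next;	/* the kernel uses READ_ONCE() here */
		if (!next) {
			if (!locked)
				return NULL;	/* lock free, queue exhausted */
			continue;		/* keep spinning and scanning */
		}
		if (next->numa_node == head->numa_node)
			return next;		/* found a local waiter */

		cur = next;
		if (!locked)
			lookahead--;	/* only scans after unlock are charged */
	}
	return NULL;
}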

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/locking/qspinlock_cna.h | 130 +++++++++++++++++++++------------
1 file changed, 83 insertions(+), 47 deletions(-)

diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
index 03e8ba71a537..89a3905870ea 100644
--- a/kernel/locking/qspinlock_cna.h
+++ b/kernel/locking/qspinlock_cna.h
@@ -59,7 +59,7 @@ struct cna_node {
u16 numa_node;
u16 real_numa_node;
u32 encoded_tail; /* self */
- u32 partial_order; /* encoded tail or enum val */
+ u32 order; /* encoded tail or enum val */
s32 start_time;
};

@@ -79,6 +79,12 @@ enum {
((m) + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ)
static int intra_node_handoff_threshold __ro_after_init = MSECS_TO_JIFFIES(10);

+/*
+ * Controls how many queue nodes to look ahead for a matching numa
+ * node after the lock is freed.
+ */
+static int matching_node_lookahead __ro_after_init;
+
static inline bool intra_node_threshold_reached(struct cna_node *cn)
{
s32 current_time = (s32)jiffies;
@@ -105,6 +111,11 @@ static void __init cna_init_nodes_per_cpu(unsigned int cpu)
*/
WARN_ON(cn->encoded_tail < MIN_ENCODED_TAIL);
}
+ /*
+ * With 4x numa-node lookahead and assuming random node distribution,
+ * the probability of not finding a matching node is less than 2%.
+ */
+ matching_node_lookahead = 4*nr_node_ids;
}

static int __init cna_init_nodes(void)
@@ -262,33 +273,67 @@ static void cna_splice_tail(struct mcs_spinlock *node,
* of current waiters. However, the amortized complexity is close to O(1),
* as the immediate successor is likely to be running on the same node once
* threads from other nodes are moved to the secondary queue.
+ *
+ * Since the lock state is being monitored at the same time and the
+ * number of additional scans after the lock is freed is limited, the
+ * additional lock acquisition latency is bounded.
*/
-static u32 cna_order_queue(struct mcs_spinlock *node,
- struct mcs_spinlock *iter)
+static u32 cna_order_queue(struct mcs_spinlock *node, struct qspinlock *lock)
{
- struct cna_node *cni = (struct cna_node *)READ_ONCE(iter->next);
struct cna_node *cn = (struct cna_node *)node;
- int nid = cn->numa_node;
- struct cna_node *last;
+ struct cna_node *cni = cn;
+ struct cna_node *found = NULL;
+ int lookahead = matching_node_lookahead;
+ bool locked = true;

- /* find any next waiter on 'our' NUMA node */
- for (last = cn;
- /*
- * iterate as long as the current node is not priorizied and
- * does not run on 'our' NUMA node
- */
- cni && cni->numa_node != CNA_PRIORITY_NODE && cni->numa_node != nid;
- last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next))
- ;
+ /*
+ * Find any next waiter on 'our' NUMA node while spinning on the lock
+ * simultaneously. Iterate as long as the current node is not
+ * prioritized and does not run on 'our' NUMA node, or until up to
+ * "lookahead" nodes have been checked after the lock was freed.
+ */
+ while (!found && lookahead) {
+ if (locked) {
+ u32 val = atomic_read(&lock->val);
+ if (!(val & _Q_LOCKED_PENDING_MASK)) {
+ locked = false;
+ continue;
+ }
+ /*
+ * If we are hitting the end of the queue, use
+ * atomic_cond_read_relaxed() to wait until the
+ * lock value changes, which means either the lock
+ * is freed or a new waiter arrives.
+ */
+ if (cni->encoded_tail == (val & _Q_TAIL_MASK))
+ atomic_cond_read_relaxed(&lock->val, VAL != val);
+ }
+ if (!found) {
+ found = (struct cna_node *)READ_ONCE(cni->mcs.next);
+ if (!found && !locked)
+ break; /* Lock is free with no more nodes to parse */
+
+ if (found && found->numa_node != CNA_PRIORITY_NODE
+ && found->numa_node != cn->numa_node) {
+ cni = found;
+ found = NULL;
+ }
+ if (!locked)
+ lookahead--;
+ }
+ }
+
+ if (!found)
+ return cni->encoded_tail; /* continue from here */

- if (!cni)
- return last->encoded_tail; /* continue from here */
+ if (found->numa_node == CNA_PRIORITY_NODE) {
+ int nid = cn->numa_node;

- if (cni->numa_node == CNA_PRIORITY_NODE)
- cni->numa_node = nid; /* Inherit node id of primary queue */
+ found->numa_node = nid; /* Inherit node id of primary queue */
+ }

- if (last != cn) /* did we skip any waiters? */
- cna_splice_tail(node, node->next, (struct mcs_spinlock *)last);
+ if (cni != cn) /* did we skip any waiters? */
+ cna_splice_tail(node, node->next, (struct mcs_spinlock *)cni);

/*
* We return LOCAL_WAITER_FOUND here even if we stopped the scan because
@@ -312,23 +357,23 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
if (!cn->start_time)
cn->start_time = (s32)jiffies;

- if (!intra_node_threshold_reached(cn)) {
- /*
- * We are at the head of the wait queue, no need to use
- * the fake NUMA node ID.
- */
- if (cn->numa_node == CNA_PRIORITY_NODE)
- cn->numa_node = cn->real_numa_node;
-
- /*
- * Try and put the time otherwise spent spin waiting on
- * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
- */
- cn->partial_order = cna_order_queue(node, node);
- } else {
- cn->partial_order = FLUSH_SECONDARY_QUEUE;
+ if (intra_node_threshold_reached(cn)) {
+ cn->order = FLUSH_SECONDARY_QUEUE;
+ goto out;
}
+ /*
+ * We are at the head of the wait queue, no need to use
+ * the fake NUMA node ID.
+ */
+ if (cn->numa_node == CNA_PRIORITY_NODE)
+ cn->numa_node = cn->real_numa_node;

+ /*
+ * Try and put the time otherwise spent spin waiting on
+ * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
+ */
+ cn->order = cna_order_queue(node, lock);
+out:
return 0; /* we lied; we didn't wait, go do so now */
}

@@ -337,18 +382,9 @@ static inline void cna_lock_handoff(struct mcs_spinlock *node,
{
struct cna_node *cn = (struct cna_node *)node;
u32 val = _Q_LOCKED_VAL;
- u32 partial_order = cn->partial_order;
-
- /*
- * check if a successor from the same numa node has not been found in
- * pre-scan, and if so, try to find it in post-scan starting from the
- * node where pre-scan stopped (stored in @pre_scan_result)
- */
- if (partial_order >= MIN_ENCODED_TAIL)
- partial_order =
- cna_order_queue(node, decode_tail(partial_order));
+ u32 order = cn->order;

- if (partial_order == LOCAL_WAITER_FOUND) {
+ if (order == LOCAL_WAITER_FOUND) {
/*
* We found a local waiter; reload @next in case we called
* cna_order_queue() above.
--
2.18.1

