[PATCH 6/9] locking/qspinlock: Make CNA priority node inherit primary queue node ID

From: Waiman Long
Date: Tue Jul 28 2020 - 12:07:54 EST


When a lock waiter is not in a task context, has irq disabled or is a
RT task. It is considered at a higher priority and so will not be thrown
into the secondary queue to suffer additional lock acquisition latency.
However, if its node id is different from the primary queue node id,
we will have the situation that the primary queue node id is changed
mid-stream and the secondary queue may have nodes with the new node id
causing them to suffer undue delay.

One way to avoid this situation is to make the priority node inherit
the node id of the primary queue. That will require two numa node
fields in the CNA node - one for real numa node id and the other for
current numa node id. Since 16 bits are more than enough for node id,
we can split the current 32-bit field into two 16-bit ones without
affecting the structure size.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/locking/qspinlock_cna.h | 27 ++++++++++++++++++++-------
1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
index b004ce6882b6..911c96279494 100644
--- a/kernel/locking/qspinlock_cna.h
+++ b/kernel/locking/qspinlock_cna.h
@@ -52,9 +52,12 @@
* Dave Dice <dave.dice@xxxxxxxxxx>
*/

+#define CNA_PRIORITY_NODE 0xffff
+
struct cna_node {
struct mcs_spinlock mcs;
- int numa_node; /* use LSB for priority */
+ u16 numa_node;
+ u16 real_numa_node;
u32 encoded_tail; /* self */
u32 partial_order; /* encoded tail or enum val */
u32 intra_count;
@@ -83,7 +86,7 @@ static void __init cna_init_nodes_per_cpu(unsigned int cpu)
for (i = 0; i < MAX_NODES; i++) {
struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i);

- cn->numa_node = numa_node << 1;
+ cn->real_numa_node = numa_node;
cn->encoded_tail = encode_tail(cpu, i);
/*
* make sure @encoded_tail is not confused with other valid
@@ -119,10 +122,10 @@ static __always_inline void cna_init_node(struct mcs_spinlock *node)
* be moved to the secondary queue.
*/
bool priority = !in_task() || irqs_disabled() || rt_task(current);
- ((struct cna_node *)node)->numa_node =
- (((struct cna_node *)node)->numa_node & ~1) | priority;
+ struct cna_node *cn = (struct cna_node *)node;

- ((struct cna_node *)node)->intra_count = 0;
+ cn->numa_node = priority ? CNA_PRIORITY_NODE : cn->real_numa_node;
+ cn->intra_count = 0;
}

/*
@@ -255,7 +258,7 @@ static u32 cna_order_queue(struct mcs_spinlock *node,
{
struct cna_node *cni = (struct cna_node *)READ_ONCE(iter->next);
struct cna_node *cn = (struct cna_node *)node;
- int nid = cn->numa_node >> 1;
+ int nid = cn->numa_node;
struct cna_node *last;

/* find any next waiter on 'our' NUMA node */
@@ -264,13 +267,16 @@ static u32 cna_order_queue(struct mcs_spinlock *node,
* iterate as long as the current node is not priorizied and
* does not run on 'our' NUMA node
*/
- cni && !(cni->numa_node & 0x1) && (cni->numa_node >> 1) != nid;
+ cni && cni->numa_node != CNA_PRIORITY_NODE && cni->numa_node != nid;
last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next))
;

if (!cni)
return last->encoded_tail; /* continue from here */

+ if (cni->numa_node == CNA_PRIORITY_NODE)
+ cni->numa_node = nid; /* Inherit node id of primary queue */
+
if (last != cn) /* did we skip any waiters? */
cna_splice_tail(node, node->next, (struct mcs_spinlock *)last);

@@ -290,6 +296,13 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
struct cna_node *cn = (struct cna_node *)node;

if (cn->intra_count < intra_node_handoff_threshold) {
+ /*
+ * We are at the head of the wait queue, no need to use
+ * the fake NUMA node ID.
+ */
+ if (cn->numa_node == CNA_PRIORITY_NODE)
+ cn->numa_node = cn->real_numa_node;
+
/*
* Try and put the time otherwise spent spin waiting on
* _Q_LOCKED_PENDING_MASK to use by sorting our lists.
--
2.18.1


--------------E1866B38CD0464E8AECD0ADC--