[PATCH] tick/broadcast: Reduce lock cacheline contention

From: Waiman Long
Date: Mon Jan 30 2017 - 12:59:58 EST


It was observed that on an Intel x86 system without the ARAT (Always
running APIC timer) feature and with fairly large number of CPUs as
well as CPUs coming in and out of intel_idle frequently, the lock
contention on the tick_broadcast_lock can become significant.

To reduce contention, the lock is put into its own cacheline and all
the cpumask_var_t variables are put into the __read_mostly section.

Running the SP benchmark of the NAS Parallel Benchmarks on a 4-socket
16-core 32-thread Nehalam system, the performance number improved
from 3353.94 Mop/s to 3469.31 Mop/s when this patch was applied on
a 4.9.6 kernel. This is a 3.4% improvement.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
include/linux/cpumask.h | 7 ++++++-
kernel/time/tick-broadcast.c | 15 ++++++++-------
2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c717f5e..23c1a6d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -649,11 +649,15 @@ static inline size_t cpumask_size(void)
* used. Please use this_cpu_cpumask_var_t in those cases. The direct use
* of this_cpu_ptr() or this_cpu_read() will lead to failures when the
* other type of cpumask_var_t implementation is configured.
+ *
+ * Please also note that __cpumask_var_read_mostly can be used to declare
+ * a cpumask_var_t variable itself (not its content) as read mostly.
*/
#ifdef CONFIG_CPUMASK_OFFSTACK
typedef struct cpumask *cpumask_var_t;

-#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x)
+#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x)
+#define __cpumask_var_read_mostly __read_mostly

bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
@@ -667,6 +671,7 @@ static inline size_t cpumask_size(void)
typedef struct cpumask cpumask_var_t[1];

#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
+#define __cpumask_var_read_mostly

static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 3109204..244c935 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -29,12 +29,13 @@
*/

static struct tick_device tick_broadcast_device;
-static cpumask_var_t tick_broadcast_mask;
-static cpumask_var_t tick_broadcast_on;
-static cpumask_var_t tmpmask;
-static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly;
+static cpumask_var_t tmpmask __cpumask_var_read_mostly;
static int tick_broadcast_forced;

+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+
#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -517,9 +518,9 @@ void tick_resume_broadcast(void)

#ifdef CONFIG_TICK_ONESHOT

-static cpumask_var_t tick_broadcast_oneshot_mask;
-static cpumask_var_t tick_broadcast_pending_mask;
-static cpumask_var_t tick_broadcast_force_mask;
+static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly;

/*
* Exposed for debugging: see timer_list.c
--
1.8.3.1