Re: [PATCH 3/3] IPI: Avoid using 2 cache lines for one call_single_data

From: Huang, Ying
Date: Mon Aug 14 2017 - 01:47:39 EST


Hi, Peter,

"Huang, Ying" <ying.huang@xxxxxxxxx> writes:

> Peter Zijlstra <peterz@xxxxxxxxxxxxx> writes:
>
>> On Sat, Aug 05, 2017 at 08:47:02AM +0800, Huang, Ying wrote:
>>> Yes. That looks good. So will you prepare the final patch? Or do you
>>> want me to do that?
>>
>> I was hoping you'd do it ;-)
>
> Thanks! Here is the updated patch
>
> Best Regards,
> Huang, Ying
>
> ---------->8----------
> From 957735e9ff3922368286540dab852986fc7b23b5 Mon Sep 17 00:00:00 2001
> From: Huang Ying <ying.huang@xxxxxxxxx>
> Date: Mon, 7 Aug 2017 16:55:33 +0800
> Subject: [PATCH -v3] IPI: Avoid using 2 cache lines for one
> call_single_data
>
> struct call_single_data is used in IPIs to transfer information between
> CPUs. Its size is bigger than sizeof(unsigned long) but smaller than a
> cache line, and it is currently allocated with no explicit alignment
> requirement. An allocated call_single_data may therefore cross a cache
> line boundary, which doubles the number of cache lines that need to be
> transferred among CPUs.
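>
> For illustration, here is a minimal user-space sketch (not part of the
> patch; it assumes 64-byte cache lines and uses a 32-byte stand-in for
> the 64-bit layout of struct call_single_data) of how an object with no
> alignment requirement can straddle a cache line boundary:
>
> 	#include <stdio.h>
>
> 	#define CACHE_LINE_SIZE	64
>
> 	/* 32-byte stand-in for struct call_single_data on 64-bit */
> 	struct csd_like {
> 		void *llist, *func, *info;
> 		unsigned long flags;
> 	};
>
> 	/* Does the object at [addr, addr + size) span two cache lines? */
> 	static int crosses_cache_line(unsigned long addr, unsigned long size)
> 	{
> 		return addr / CACHE_LINE_SIZE !=
> 			(addr + size - 1) / CACHE_LINE_SIZE;
> 	}
>
> 	int main(void)
> 	{
> 		/* starting 48 bytes into a line: spills into the next line */
> 		printf("%d\n", crosses_cache_line(48, sizeof(struct csd_like)));
> 		/* aligned to its own (power-of-2) size: stays in one line */
> 		printf("%d\n", crosses_cache_line(64, sizeof(struct csd_like)));
> 		return 0;
> 	}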
>
> This is resolved by requiring call_single_data to be aligned to the
> size of call_single_data. The size of call_single_data is currently a
> power of 2; if new fields are added to call_single_data, padding may
> need to be added to keep its size a power of 2. Fortunately, this is
> enforced by gcc, which reports an error when the requested alignment is
> not a power of 2.
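>
> As a stand-alone sketch of that gcc behavior (the exact diagnostic text
> varies by compiler version):
>
> 	/* accepted: 32 is a power of 2 */
> 	struct ok { long a, b, c, d; } __attribute__((aligned(32)));
>
> 	/* rejected by gcc, e.g. "requested alignment is not a positive power of 2" */
> 	struct bad { long a, b, c; } __attribute__((aligned(24)));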
>
> To set the alignment requirement of call_single_data to its own size,
> a struct definition and a typedef are used.
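>
> For reference, here is a minimal stand-alone version of that pattern
> (hypothetical names, and C11 _Static_assert instead of the kernel's
> BUILD_BUG_ON) that also checks the resulting alignment at build time:
>
> 	struct __foo_data {
> 		void *a, *b, *c;
> 		unsigned int flags;
> 	};
>
> 	/* align the typedef'd type to the size of the underlying struct */
> 	typedef struct __foo_data foo_data_t
> 		__attribute__((aligned(sizeof(struct __foo_data))));
>
> 	/* build-time check that the alignment equals the struct size */
> 	_Static_assert(__alignof__(foo_data_t) == sizeof(struct __foo_data),
> 		       "foo_data_t must be aligned to its own size");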
>
> To measure the effect of the patch, we use the vm-scalability
> multi-thread swap test case (swap-w-seq-mt). The test creates multiple
> threads, each of which eats memory until all RAM and part of the swap
> space are used, so that a huge number of IPIs are triggered when the
> memory is unmapped. In the test, memory write throughput improves by
> ~5% compared with misaligned call_single_data, thanks to the faster
> IPIs.

What do you think about this version?

Best Regards,
Huang, Ying

> [Add call_single_data_t and align with size of call_single_data]
> Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Signed-off-by: "Huang, Ying" <ying.huang@xxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Juergen Gross <jgross@xxxxxxxx>
> Cc: Aaron Lu <aaron.lu@xxxxxxxxx>
> ---
> arch/mips/kernel/smp.c | 6 ++--
> block/blk-softirq.c | 2 +-
> drivers/block/null_blk.c | 2 +-
> drivers/cpuidle/coupled.c | 10 +++----
> drivers/net/ethernet/cavium/liquidio/lio_main.c | 2 +-
> drivers/net/ethernet/cavium/liquidio/octeon_droq.h | 2 +-
> include/linux/blkdev.h | 2 +-
> include/linux/netdevice.h | 2 +-
> include/linux/smp.h | 8 ++++--
> kernel/sched/sched.h | 2 +-
> kernel/smp.c | 32 ++++++++++++----------
> kernel/up.c | 2 +-
> 12 files changed, 39 insertions(+), 33 deletions(-)
>
> diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
> index 770d4d1516cb..bd8ba5472bca 100644
> --- a/arch/mips/kernel/smp.c
> +++ b/arch/mips/kernel/smp.c
> @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
> #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
>
> static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
> -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
> +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
>
> void tick_broadcast(const struct cpumask *mask)
> {
> atomic_t *count;
> - struct call_single_data *csd;
> + call_single_data_t *csd;
> int cpu;
>
> for_each_cpu(cpu, mask) {
> @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
>
> static int __init tick_broadcast_init(void)
> {
> - struct call_single_data *csd;
> + call_single_data_t *csd;
> int cpu;
>
> for (cpu = 0; cpu < NR_CPUS; cpu++) {
> diff --git a/block/blk-softirq.c b/block/blk-softirq.c
> index 87b7df4851bf..07125e7941f4 100644
> --- a/block/blk-softirq.c
> +++ b/block/blk-softirq.c
> @@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
> static int raise_blk_irq(int cpu, struct request *rq)
> {
> if (cpu_online(cpu)) {
> - struct call_single_data *data = &rq->csd;
> + call_single_data_t *data = &rq->csd;
>
> data->func = trigger_softirq;
> data->info = rq;
> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
> index 85c24cace973..81142ce781da 100644
> --- a/drivers/block/null_blk.c
> +++ b/drivers/block/null_blk.c
> @@ -13,7 +13,7 @@
> struct nullb_cmd {
> struct list_head list;
> struct llist_node ll_list;
> - struct call_single_data csd;
> + call_single_data_t csd;
> struct request *rq;
> struct bio *bio;
> unsigned int tag;
> diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
> index 71e586d7df71..147f38ea0fcd 100644
> --- a/drivers/cpuidle/coupled.c
> +++ b/drivers/cpuidle/coupled.c
> @@ -119,13 +119,13 @@ struct cpuidle_coupled {
>
> #define CPUIDLE_COUPLED_NOT_IDLE (-1)
>
> -static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
> +static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);
>
> /*
> * The cpuidle_coupled_poke_pending mask is used to avoid calling
> - * __smp_call_function_single with the per cpu call_single_data struct already
> + * __smp_call_function_single with the per cpu call_single_data_t struct already
> * in use. This prevents a deadlock where two cpus are waiting for each others
> - * call_single_data struct to be available
> + * call_single_data_t struct to be available
> */
> static cpumask_t cpuidle_coupled_poke_pending;
>
> @@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info)
> */
> static void cpuidle_coupled_poke(int cpu)
> {
> - struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
> + call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
>
> if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
> smp_call_function_single_async(cpu, csd);
> @@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
> {
> int cpu;
> struct cpuidle_device *other_dev;
> - struct call_single_data *csd;
> + call_single_data_t *csd;
> struct cpuidle_coupled *coupled;
>
> if (cpumask_empty(&dev->coupled_cpus))
> diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
> index 51583ae4b1eb..120b6e537b28 100644
> --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
> +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
> @@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg)
> if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
> napi_schedule_irqoff(&droq->napi);
> } else {
> - struct call_single_data *csd = &droq->csd;
> + call_single_data_t *csd = &droq->csd;
>
> csd->func = napi_schedule_wrapper;
> csd->info = &droq->napi;
> diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
> index 6efd139b894d..f91bc84d1719 100644
> --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
> +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
> @@ -328,7 +328,7 @@ struct octeon_droq {
>
> u32 cpu_id;
>
> - struct call_single_data csd;
> + call_single_data_t csd;
> };
>
> #define OCT_DROQ_SIZE (sizeof(struct octeon_droq))
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 25f6a0cb27d3..006fa09a641e 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t;
> struct request {
> struct list_head queuelist;
> union {
> - struct call_single_data csd;
> + call_single_data_t csd;
> u64 fifo_time;
> };
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 779b23595596..6557f320b66e 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2774,7 +2774,7 @@ struct softnet_data {
> unsigned int input_queue_head ____cacheline_aligned_in_smp;
>
> /* Elements below can be accessed between CPUs for RPS/RFS */
> - struct call_single_data csd ____cacheline_aligned_in_smp;
> + call_single_data_t csd ____cacheline_aligned_in_smp;
> struct softnet_data *rps_ipi_next;
> unsigned int cpu;
> unsigned int input_queue_tail;
> diff --git a/include/linux/smp.h b/include/linux/smp.h
> index 68123c1fe549..98b1fe027fc9 100644
> --- a/include/linux/smp.h
> +++ b/include/linux/smp.h
> @@ -14,13 +14,17 @@
> #include <linux/llist.h>
>
> typedef void (*smp_call_func_t)(void *info);
> -struct call_single_data {
> +struct __call_single_data {
> struct llist_node llist;
> smp_call_func_t func;
> void *info;
> unsigned int flags;
> };
>
> +/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
> +typedef struct __call_single_data call_single_data_t
> + __aligned(sizeof(struct __call_single_data));
> +
> /* total number of cpus in this system (may exceed NR_CPUS) */
> extern unsigned int total_cpus;
>
> @@ -48,7 +52,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
> smp_call_func_t func, void *info, bool wait,
> gfp_t gfp_flags);
>
> -int smp_call_function_single_async(int cpu, struct call_single_data *csd);
> +int smp_call_function_single_async(int cpu, call_single_data_t *csd);
>
> #ifdef CONFIG_SMP
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index eeef1a3086d1..f29a7d2b57e1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -769,7 +769,7 @@ struct rq {
> #ifdef CONFIG_SCHED_HRTICK
> #ifdef CONFIG_SMP
> int hrtick_csd_pending;
> - struct call_single_data hrtick_csd;
> + call_single_data_t hrtick_csd;
> #endif
> struct hrtimer hrtick_timer;
> #endif
> diff --git a/kernel/smp.c b/kernel/smp.c
> index 3061483cb3ad..81cfca9b4cc3 100644
> --- a/kernel/smp.c
> +++ b/kernel/smp.c
> @@ -28,7 +28,7 @@ enum {
> };
>
> struct call_function_data {
> - struct call_single_data __percpu *csd;
> + call_single_data_t __percpu *csd;
> cpumask_var_t cpumask;
> cpumask_var_t cpumask_ipi;
> };
> @@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
> free_cpumask_var(cfd->cpumask);
> return -ENOMEM;
> }
> - cfd->csd = alloc_percpu(struct call_single_data);
> + cfd->csd = alloc_percpu(call_single_data_t);
> if (!cfd->csd) {
> free_cpumask_var(cfd->cpumask);
> free_cpumask_var(cfd->cpumask_ipi);
> @@ -103,12 +103,12 @@ void __init call_function_init(void)
> * previous function call. For multi-cpu calls its even more interesting
> * as we'll have to ensure no other cpu is observing our csd.
> */
> -static __always_inline void csd_lock_wait(struct call_single_data *csd)
> +static __always_inline void csd_lock_wait(call_single_data_t *csd)
> {
> smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
> }
>
> -static __always_inline void csd_lock(struct call_single_data *csd)
> +static __always_inline void csd_lock(call_single_data_t *csd)
> {
> csd_lock_wait(csd);
> csd->flags |= CSD_FLAG_LOCK;
> @@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
> /*
> * prevent CPU from reordering the above assignment
> * to ->flags with any subsequent assignments to other
> - * fields of the specified call_single_data structure:
> + * fields of the specified call_single_data_t structure:
> */
> smp_wmb();
> }
>
> -static __always_inline void csd_unlock(struct call_single_data *csd)
> +static __always_inline void csd_unlock(call_single_data_t *csd)
> {
> WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
>
> @@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
> smp_store_release(&csd->flags, 0);
> }
>
> -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
> +static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
>
> /*
> - * Insert a previously allocated call_single_data element
> + * Insert a previously allocated call_single_data_t element
> * for execution on the given CPU. data must already have
> * ->func, ->info, and ->flags set.
> */
> -static int generic_exec_single(int cpu, struct call_single_data *csd,
> +static int generic_exec_single(int cpu, call_single_data_t *csd,
> smp_call_func_t func, void *info)
> {
> if (cpu == smp_processor_id()) {
> @@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
> {
> struct llist_head *head;
> struct llist_node *entry;
> - struct call_single_data *csd, *csd_next;
> + call_single_data_t *csd, *csd_next;
> static bool warned;
>
> WARN_ON(!irqs_disabled());
> @@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
> int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
> int wait)
> {
> - struct call_single_data *csd;
> - struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
> + call_single_data_t *csd;
> + call_single_data_t csd_stack = {
> + .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
> + };
> int this_cpu;
> int err;
>
> @@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
> * NOTE: Be careful, there is unfortunately no current debugging facility to
> * validate the correctness of this serialization.
> */
> -int smp_call_function_single_async(int cpu, struct call_single_data *csd)
> +int smp_call_function_single_async(int cpu, call_single_data_t *csd)
> {
> int err = 0;
>
> @@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
>
> cpumask_clear(cfd->cpumask_ipi);
> for_each_cpu(cpu, cfd->cpumask) {
> - struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
> + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
>
> csd_lock(csd);
> if (wait)
> @@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
>
> if (wait) {
> for_each_cpu(cpu, cfd->cpumask) {
> - struct call_single_data *csd;
> + call_single_data_t *csd;
>
> csd = per_cpu_ptr(cfd->csd, cpu);
> csd_lock_wait(csd);
> diff --git a/kernel/up.c b/kernel/up.c
> index ee81ac9af4ca..42c46bf3e0a5 100644
> --- a/kernel/up.c
> +++ b/kernel/up.c
> @@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
> }
> EXPORT_SYMBOL(smp_call_function_single);
>
> -int smp_call_function_single_async(int cpu, struct call_single_data *csd)
> +int smp_call_function_single_async(int cpu, call_single_data_t *csd)
> {
> unsigned long flags;