Re: [PATCH v1 6/8] x86/tsc: Use seqcount_latch_t

From: Ahmed S. Darwish
Date: Mon Sep 07 2020 - 12:29:04 EST


Hi,

On Fri, Sep 04, 2020 at 10:03:12AM +0200, peterz@xxxxxxxxxxxxx wrote:
> On Fri, Sep 04, 2020 at 09:41:42AM +0200, peterz@xxxxxxxxxxxxx wrote:
> > On Thu, Aug 27, 2020 at 01:40:42PM +0200, Ahmed S. Darwish wrote:
> >
> > > __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
> > > {
> > > + seqcount_latch_t *seqcount;
> > > int seq, idx;
> > >
> > > preempt_disable_notrace();
> > >
> > > + seqcount = &this_cpu_ptr(&cyc2ns)->seq;
> > > do {
> > > - seq = this_cpu_read(cyc2ns.seq.sequence);
> > > + seq = raw_read_seqcount_latch(seqcount);
> > > idx = seq & 1;
> > >
> > > data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
> > > data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
> > > data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
> > >
> > > - } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
> > > + } while (read_seqcount_latch_retry(seqcount, seq));
> > > }
> >
> > So I worried about this change, it obviously generates worse code. But I
> > was not expecting this:
> >
> > Before:
> >
> > 196: 0000000000000110 189 FUNC GLOBAL DEFAULT 1 native_sched_clock
> >
> > After:
> >
> > 195: 0000000000000110 399 FUNC GLOBAL DEFAULT 1 native_sched_clock
> >
> > That's _210_ bytes extra!!
> >
> > If you look at the disassembly of the thing after it's a complete
> > trainwreck.
>
> The below delta fixes it again.
>
> ---
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -68,21 +68,19 @@ early_param("tsc_early_khz", tsc_early_k
>
> __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
> {
> - seqcount_latch_t *seqcount;
> int seq, idx;
>
> preempt_disable_notrace();
>
> - seqcount = &this_cpu_ptr(&cyc2ns)->seq;
> do {
> - seq = raw_read_seqcount_latch(seqcount);
> + seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
> idx = seq & 1;
>
> data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
> data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
> data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
>
> - } while (read_seqcount_latch_retry(seqcount, seq));
> + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
> }
>
> __always_inline void cyc2ns_read_end(void)

I've been unsuccessful in reproducing this huge, 200+ bytes, difference.
Can I please get the defconfig and GCC version?

Here are the two competing implementations:

noinline void cyc2ns_read_begin_v1(struct cyc2ns_data *data)
{
seqcount_latch_t *seqcount;
int seq, idx;

preempt_disable_notrace();

seqcount = &this_cpu_ptr(&cyc2ns)->seq;
do {
seq = raw_read_seqcount_latch(seqcount);
idx = seq & 1;

data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

} while (read_seqcount_latch_retry(seqcount, seq));
}

noinline void cyc2ns_read_begin_v2(struct cyc2ns_data *data)
{
int seq, idx;

preempt_disable_notrace();

do {
seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
idx = seq & 1;

data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

} while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
}

And here are their resulting gcc-7/8/4.9 binary output (DEBUG_PREEMPT=y,
but setting DEBUG_KERNEL/DEBUG_PREEMPT=n also doesn't show any big _v1
vs. _v2 difference; similarly when using gcc-10):

gcc-7
=====

ffffffff81028890 <cyc2ns_read_begin_v1>:
ffffffff81028890: 65 ff 05 a9 e6 fe 7e incl %gs:0x7efee6a9(%rip) # 16f40 <__preempt_count>
ffffffff81028893: R_X86_64_PC32 __preempt_count-0x4
ffffffff81028897: 48 c7 c6 40 a5 1f 00 mov $0x1fa540,%rsi
ffffffff8102889a: R_X86_64_32S .data..percpu+0x1fa540
ffffffff8102889e: 48 89 f2 mov %rsi,%rdx
ffffffff810288a1: 65 48 03 15 bf 8a fe add %gs:0x7efe8abf(%rip),%rdx # 11368 <this_cpu_off>
ffffffff810288a8: 7e
ffffffff810288a5: R_X86_64_PC32 this_cpu_off-0x4
ffffffff810288a9: 8b 4a 20 mov 0x20(%rdx),%ecx
ffffffff810288ac: 89 c8 mov %ecx,%eax
ffffffff810288ae: 83 e0 01 and $0x1,%eax
ffffffff810288b1: 48 c1 e0 04 shl $0x4,%rax
ffffffff810288b5: 48 01 f0 add %rsi,%rax
ffffffff810288b8: 65 4c 8b 40 08 mov %gs:0x8(%rax),%r8
ffffffff810288bd: 4c 89 47 08 mov %r8,0x8(%rdi)
ffffffff810288c1: 65 44 8b 00 mov %gs:(%rax),%r8d
ffffffff810288c5: 44 89 07 mov %r8d,(%rdi)
ffffffff810288c8: 65 8b 40 04 mov %gs:0x4(%rax),%eax
ffffffff810288cc: 89 47 04 mov %eax,0x4(%rdi)
ffffffff810288cf: 8b 42 20 mov 0x20(%rdx),%eax
ffffffff810288d2: 39 c8 cmp %ecx,%eax
ffffffff810288d4: 75 d3 jne ffffffff810288a9 <cyc2ns_read_begin_v1+0x19>
ffffffff810288d6: f3 c3 repz retq
ffffffff810288d8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
ffffffff810288df: 00

ffffffff810288e0 <cyc2ns_read_begin_v2>:
ffffffff810288e0: 65 ff 05 59 e6 fe 7e incl %gs:0x7efee659(%rip) # 16f40 <__preempt_count>
ffffffff810288e3: R_X86_64_PC32 __preempt_count-0x4
ffffffff810288e7: 48 c7 c1 40 a5 1f 00 mov $0x1fa540,%rcx
ffffffff810288ea: R_X86_64_32S .data..percpu+0x1fa540
ffffffff810288ee: 48 89 c8 mov %rcx,%rax
ffffffff810288f1: 65 48 03 05 6f 8a fe add %gs:0x7efe8a6f(%rip),%rax # 11368 <this_cpu_off>
ffffffff810288f8: 7e
ffffffff810288f5: R_X86_64_PC32 this_cpu_off-0x4
ffffffff810288f9: 65 8b 15 60 1c 1d 7f mov %gs:0x7f1d1c60(%rip),%edx # 1fa560 <cyc2ns+0x20>
ffffffff810288fc: R_X86_64_PC32 .data..percpu+0x1fa55c
ffffffff81028900: 89 d0 mov %edx,%eax
ffffffff81028902: 83 e0 01 and $0x1,%eax
ffffffff81028905: 48 c1 e0 04 shl $0x4,%rax
ffffffff81028909: 48 01 c8 add %rcx,%rax
ffffffff8102890c: 65 48 8b 70 08 mov %gs:0x8(%rax),%rsi
ffffffff81028911: 48 89 77 08 mov %rsi,0x8(%rdi)
ffffffff81028915: 65 8b 30 mov %gs:(%rax),%esi
ffffffff81028918: 89 37 mov %esi,(%rdi)
ffffffff8102891a: 65 8b 40 04 mov %gs:0x4(%rax),%eax
ffffffff8102891e: 89 47 04 mov %eax,0x4(%rdi)
ffffffff81028921: 65 8b 05 38 1c 1d 7f mov %gs:0x7f1d1c38(%rip),%eax # 1fa560 <cyc2ns+0x20>
ffffffff81028924: R_X86_64_PC32 .data..percpu+0x1fa55c
ffffffff81028928: 39 c2 cmp %eax,%edx
ffffffff8102892a: 75 cd jne ffffffff810288f9 <cyc2ns_read_begin_v2+0x19>
ffffffff8102892c: f3 c3 repz retq
ffffffff8102892e: 66 90 xchg %ax,%ax

gcc-8
=====

ffffffff810281a0 <cyc2ns_read_begin_v1>:
ffffffff810281a0: 65 ff 05 99 ed fe 7e incl %gs:0x7efeed99(%rip) # 16f40 <__preempt_count>
ffffffff810281a3: R_X86_64_PC32 __preempt_count-0x4
ffffffff810281a7: 49 c7 c0 40 a5 1f 00 mov $0x1fa540,%r8
ffffffff810281aa: R_X86_64_32S .data..percpu+0x1fa540
ffffffff810281ae: 4c 89 c1 mov %r8,%rcx
ffffffff810281b1: 65 48 03 0d af 91 fe add %gs:0x7efe91af(%rip),%rcx # 11368 <this_cpu_off>
ffffffff810281b8: 7e
ffffffff810281b5: R_X86_64_PC32 this_cpu_off-0x4
ffffffff810281b9: 8b 51 20 mov 0x20(%rcx),%edx
ffffffff810281bc: 89 d0 mov %edx,%eax
ffffffff810281be: 83 e0 01 and $0x1,%eax
ffffffff810281c1: 48 c1 e0 04 shl $0x4,%rax
ffffffff810281c5: 4c 01 c0 add %r8,%rax
ffffffff810281c8: 65 48 8b 70 08 mov %gs:0x8(%rax),%rsi
ffffffff810281cd: 48 89 77 08 mov %rsi,0x8(%rdi)
ffffffff810281d1: 65 8b 30 mov %gs:(%rax),%esi
ffffffff810281d4: 89 37 mov %esi,(%rdi)
ffffffff810281d6: 65 8b 40 04 mov %gs:0x4(%rax),%eax
ffffffff810281da: 89 47 04 mov %eax,0x4(%rdi)
ffffffff810281dd: 8b 41 20 mov 0x20(%rcx),%eax
ffffffff810281e0: 39 d0 cmp %edx,%eax
ffffffff810281e2: 75 d5 jne ffffffff810281b9 <cyc2ns_read_begin_v1+0x19>
ffffffff810281e4: c3 retq
ffffffff810281e5: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
ffffffff810281ec: 00 00 00 00

ffffffff810281f0 <cyc2ns_read_begin_v2>:
ffffffff810281f0: 65 ff 05 49 ed fe 7e incl %gs:0x7efeed49(%rip) # 16f40 <__preempt_count>
ffffffff810281f3: R_X86_64_PC32 __preempt_count-0x4
ffffffff810281f7: 48 c7 c1 40 a5 1f 00 mov $0x1fa540,%rcx
ffffffff810281fa: R_X86_64_32S .data..percpu+0x1fa540
ffffffff810281fe: 48 89 c8 mov %rcx,%rax
ffffffff81028201: 65 48 03 05 5f 91 fe add %gs:0x7efe915f(%rip),%rax # 11368 <this_cpu_off>
ffffffff81028208: 7e
ffffffff81028205: R_X86_64_PC32 this_cpu_off-0x4
ffffffff81028209: 65 8b 15 50 23 1d 7f mov %gs:0x7f1d2350(%rip),%edx # 1fa560 <cyc2ns+0x20>
ffffffff8102820c: R_X86_64_PC32 .data..percpu+0x1fa55c
ffffffff81028210: 89 d0 mov %edx,%eax
ffffffff81028212: 83 e0 01 and $0x1,%eax
ffffffff81028215: 48 c1 e0 04 shl $0x4,%rax
ffffffff81028219: 48 01 c8 add %rcx,%rax
ffffffff8102821c: 65 48 8b 70 08 mov %gs:0x8(%rax),%rsi
ffffffff81028221: 48 89 77 08 mov %rsi,0x8(%rdi)
ffffffff81028225: 65 8b 30 mov %gs:(%rax),%esi
ffffffff81028228: 89 37 mov %esi,(%rdi)
ffffffff8102822a: 65 8b 40 04 mov %gs:0x4(%rax),%eax
ffffffff8102822e: 89 47 04 mov %eax,0x4(%rdi)
ffffffff81028231: 65 8b 05 28 23 1d 7f mov %gs:0x7f1d2328(%rip),%eax # 1fa560 <cyc2ns+0x20>
ffffffff81028234: R_X86_64_PC32 .data..percpu+0x1fa55c
ffffffff81028238: 39 c2 cmp %eax,%edx
ffffffff8102823a: 75 cd jne ffffffff81028209 <cyc2ns_read_begin_v2+0x19>
ffffffff8102823c: c3 retq
ffffffff8102823d: 0f 1f 00 nopl (%rax)

gcc-4.9
=======

ffffffff810ab900 <cyc2ns_read_begin_v1>:
ffffffff810ab900: 55 push %rbp
ffffffff810ab901: 48 89 fd mov %rdi,%rbp
ffffffff810ab904: 53 push %rbx
ffffffff810ab905: 65 ff 05 f4 b6 f6 7e incl %gs:0x7ef6b6f4(%rip) # 17000 <__preempt_count>
ffffffff810ab908: R_X86_64_PC32 __preempt_count-0x4
ffffffff810ab90c: e8 df da c9 00 callq ffffffff81d493f0 <debug_smp_processor_id>
ffffffff810ab90d: R_X86_64_PC32 debug_smp_processor_id-0x4
ffffffff810ab911: 89 c0 mov %eax,%eax
ffffffff810ab913: 48 c7 c3 00 ab 02 00 mov $0x2ab00,%rbx
ffffffff810ab916: R_X86_64_32S .data..percpu+0x2ab00
ffffffff810ab91a: 48 03 1c c5 80 87 65 add -0x7d9a7880(,%rax,8),%rbx
ffffffff810ab921: 82
ffffffff810ab91e: R_X86_64_32S __per_cpu_offset
ffffffff810ab922: 8b 53 20 mov 0x20(%rbx),%edx
ffffffff810ab925: 89 d0 mov %edx,%eax
ffffffff810ab927: 83 e0 01 and $0x1,%eax
ffffffff810ab92a: 48 c1 e0 04 shl $0x4,%rax
ffffffff810ab92e: 65 48 8b 88 08 ab 02 mov %gs:0x2ab08(%rax),%rcx
ffffffff810ab935: 00
ffffffff810ab932: R_X86_64_32S .data..percpu+0x2ab08
ffffffff810ab936: 48 89 4d 08 mov %rcx,0x8(%rbp)
ffffffff810ab93a: 65 8b 88 00 ab 02 00 mov %gs:0x2ab00(%rax),%ecx
ffffffff810ab93d: R_X86_64_32S .data..percpu+0x2ab00
ffffffff810ab941: 89 4d 00 mov %ecx,0x0(%rbp)
ffffffff810ab944: 65 8b 80 04 ab 02 00 mov %gs:0x2ab04(%rax),%eax
ffffffff810ab947: R_X86_64_32S .data..percpu+0x2ab04
ffffffff810ab94b: 89 45 04 mov %eax,0x4(%rbp)
ffffffff810ab94e: 8b 43 20 mov 0x20(%rbx),%eax
ffffffff810ab951: 39 c2 cmp %eax,%edx
ffffffff810ab953: 75 cd jne ffffffff810ab922 <cyc2ns_read_begin_v1+0x22>
ffffffff810ab955: 5b pop %rbx
ffffffff810ab956: 5d pop %rbp
ffffffff810ab957: c3 retq
ffffffff810ab958: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
ffffffff810ab95f: 00

ffffffff810ab960 <cyc2ns_read_begin_v2>:
ffffffff810ab960: 65 ff 05 99 b6 f6 7e incl %gs:0x7ef6b699(%rip) # 17000 <__preempt_count>
ffffffff810ab963: R_X86_64_PC32 __preempt_count-0x4
ffffffff810ab967: 65 8b 15 b2 f1 f7 7e mov %gs:0x7ef7f1b2(%rip),%edx # 2ab20 <cyc2ns+0x20>
ffffffff810ab96a: R_X86_64_PC32 .data..percpu+0x2ab1c
ffffffff810ab96e: 89 d0 mov %edx,%eax
ffffffff810ab970: 83 e0 01 and $0x1,%eax
ffffffff810ab973: 48 c1 e0 04 shl $0x4,%rax
ffffffff810ab977: 65 48 8b 88 08 ab 02 mov %gs:0x2ab08(%rax),%rcx
ffffffff810ab97e: 00
ffffffff810ab97b: R_X86_64_32S .data..percpu+0x2ab08
ffffffff810ab97f: 48 89 4f 08 mov %rcx,0x8(%rdi)
ffffffff810ab983: 65 8b 88 00 ab 02 00 mov %gs:0x2ab00(%rax),%ecx
ffffffff810ab986: R_X86_64_32S .data..percpu+0x2ab00
ffffffff810ab98a: 89 0f mov %ecx,(%rdi)
ffffffff810ab98c: 65 8b 80 04 ab 02 00 mov %gs:0x2ab04(%rax),%eax
ffffffff810ab98f: R_X86_64_32S .data..percpu+0x2ab04
ffffffff810ab993: 89 47 04 mov %eax,0x4(%rdi)
ffffffff810ab996: 65 8b 05 83 f1 f7 7e mov %gs:0x7ef7f183(%rip),%eax # 2ab20 <cyc2ns+0x20>
ffffffff810ab999: R_X86_64_PC32 .data..percpu+0x2ab1c
ffffffff810ab99d: 39 c2 cmp %eax,%edx
ffffffff810ab99f: 75 c6 jne ffffffff810ab967 <cyc2ns_read_begin_v2+0x7>
ffffffff810ab9a1: f3 c3 repz retq
ffffffff810ab9a3: 66 66 66 66 2e 0f 1f data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
ffffffff810ab9aa: 84 00 00 00 00 00

Thanks,

--
Ahmed S. Darwish
Linutronix GmbH