Re: x86 PMU broken in current Linus' tree

From: Jiri Kosina
Date: Mon Aug 08 2016 - 10:41:52 EST


On Mon, 8 Aug 2016, Peter Zijlstra wrote:

> So I can reproduce on my Lenovo T500 which has a Core2 as well. By long
> and tedious printk() it looks like the event constraint:
>
> FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */

[ adding Boris to CC ]

This made me wonder, and it turned out that HWEIGHT() doesn't seem to be
doing the right thing on machines that don't have popcnt instruction
(which is probably the distinguishing factor here).

How about the patch below? It fixes the symptoms on my system.




From: Jiri Kosina <jkosina@xxxxxxx>
Subject: [PATCH] x86/hweight: fix open-coded versions of 32bit and 64bit hweight calculations

The open-coded versions of __sw_hweight32() and __sw_hweight64() are broken;
the variable <-> register mapping in the comments doesn't match the registers
being used, resulting in wrong calculations.

This has a potential to demonstrate itself by various symptoms on machines
where this is actually being used to compute the hweight (due to lack of
popcnt insn). On my core2 system, this demonstrates itself as

NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.
unchecked MSR access error: WRMSR to 0xdf (tried to write 0x000000ff80000001) at rIP: 0xffffffff90004acc (x86_perf_event_set_period+0xdc/0x190)
ffff8e39f941b000 ffff8e39fbc0a440 000000000000001e ffff8e39fbc0a660
ffff8e39f9523b78 ffffffff90004bd0 ffff8e39f941b000 ffff8e39fbc0a760
ffff8e39fbc0a440 ffff8e39f9523bc0 ffffffff900053ef 00000000f951c2c0
Call Trace:
[<ffffffff90004bd0>] x86_pmu_start+0x50/0x110
[<ffffffff900053ef>] x86_pmu_enable+0x27f/0x2f0
[<ffffffff90175642>] perf_pmu_enable+0x22/0x30
[<ffffffff901766b1>] ctx_resched+0x51/0x60
[<ffffffff901768a0>] __perf_event_enable+0x1e0/0x240
[<ffffffff9016e5f9>] event_function+0xa9/0x180
[<ffffffff901766c0>] ? ctx_resched+0x60/0x60
[<ffffffff9016fcef>] remote_function+0x3f/0x50
[<ffffffff901012b6>] generic_exec_single+0xb6/0x140
[<ffffffff9016fcb0>] ? perf_cgroup_attach+0x50/0x50
[<ffffffff9016fcb0>] ? perf_cgroup_attach+0x50/0x50
[<ffffffff901013f7>] smp_call_function_single+0xb7/0x110
[<ffffffff9016e984>] cpu_function_call+0x34/0x40
[<ffffffff9016e550>] ? list_del_event+0x150/0x150
[<ffffffff9016ecda>] event_function_call+0x11a/0x120
[<ffffffff901766c0>] ? ctx_resched+0x60/0x60
[<ffffffff9016ed79>] _perf_event_enable+0x49/0x70
[<ffffffff901736ac>] perf_event_enable+0x1c/0x40
[<ffffffff9013cad2>] watchdog_enable+0x132/0x1d0
[<ffffffff90092440>] smpboot_thread_fn+0xe0/0x1d0
[<ffffffff90092360>] ? sort_range+0x30/0x30
[<ffffffff9008e7e2>] kthread+0xf2/0x110
[<ffffffff9069e611>] ? wait_for_completion+0xe1/0x110
[<ffffffff906a3b2f>] ret_from_fork+0x1f/0x40
[<ffffffff9008e6f0>] ? kthread_create_on_node+0x220/0x220

as FIXED_EVENT_CONSTRAINT() doesn't produce correct results, and a follow-up panic
in x86_pmu_stop(). YMMV.

Fix this by rewriting the code so that it actually matches the math in
comments.

Fixes: f5967101e9d ("x86/hweight: Get rid of the special calling convention")
Signed-off-by: Jiri Kosina <jkosina@xxxxxxx>
---
arch/x86/lib/hweight.S | 43 +++++++++++++++++++++----------------------
1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 02de3d7..9d0540f 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -8,24 +8,23 @@
*/
ENTRY(__sw_hweight32)

+ __ASM_SIZE(push,) %__ASM_REG(dx)
#ifdef CONFIG_X86_64
- movl %edi, %eax # w
+ movl %edi, %edx # w -> t
#endif
- __ASM_SIZE(push,) %__ASM_REG(dx)
- movl %eax, %edx # w -> t
- shrl %edx # t >>= 1
- andl $0x55555555, %edx # t &= 0x55555555
- subl %edx, %eax # w -= t
+ shrl %edx # w >>= 1
+ andl $0x55555555, %edx # w &= 0x55555555
+ subl %edx, %edi # w -= t
+ movl %edi, %eax # w -> t

- movl %eax, %edx # w -> t
- shrl $2, %eax # w_tmp >>= 2
- andl $0x33333333, %edx # t &= 0x33333333
- andl $0x33333333, %eax # w_tmp &= 0x33333333
- addl %edx, %eax # w = w_tmp + t
+ shrl $2, %edi # w_tmp >>= 2
+ andl $0x33333333, %eax # t &= 0x33333333
+ andl $0x33333333, %edi # w_tmp &= 0x33333333
+ addl %eax, %edi # w = w_tmp + t

- movl %eax, %edx # w -> t
- shrl $4, %edx # t >>= 4
- addl %edx, %eax # w_tmp += t
+ movl %edi, %eax # w -> t
+ shrl $4, %eax # t >>= 4
+ addl %edi, %eax # w_tmp += t
andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f
imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
shrl $24, %eax # w = w_tmp >> 24
@@ -40,20 +39,20 @@ ENTRY(__sw_hweight64)
movq %rdi, %rdx # w -> t
movabsq $0x5555555555555555, %rax
shrq %rdx # t >>= 1
- andq %rdx, %rax # t &= 0x5555555555555555
+ andq %rax, %rdx # t &= 0x5555555555555555
+ subq %rdx, %rdi # w -= t
movabsq $0x3333333333333333, %rdx
- subq %rax, %rdi # w -= t

movq %rdi, %rax # w -> t
shrq $2, %rdi # w_tmp >>= 2
andq %rdx, %rax # t &= 0x3333333333333333
- andq %rdi, %rdx # w_tmp &= 0x3333333333333333
- addq %rdx, %rax # w = w_tmp + t
-
- movq %rax, %rdx # w -> t
- shrq $4, %rdx # t >>= 4
- addq %rdx, %rax # w_tmp += t
+ andq %rdx, %rdi # w_tmp &= 0x3333333333333333
movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+ addq %rax, %rdi # w = w_tmp + t
+
+ movq %rdi, %rax # w -> t
+ shrq $4, %rax # t >>= 4
+ addq %rdi, %rax # w_tmp += t
andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f
movabsq $0x0101010101010101, %rdx
imulq %rdx, %rax # w_tmp *= 0x0101010101010101

--
Jiri Kosina
SUSE Labs