[PATCH v2] x86/hweight: fix open-coded versions of 32bit and 64bit hweight calculations

From: Jiri Kosina
Date: Mon Aug 08 2016 - 12:12:23 EST


From: Jiri Kosina <jkosina@xxxxxxx>

The open-coded version of __sw_hweight32() and __sw_hweight64() are
broken; the variable <-> register mapping in the comments doesn't match
the registers being used, resulting in wrong calculations.

This has a potential to demonstrate itself by various symptoms on machines
where this is actually being used to compute the hweight (due to lack of
popcnt insn). On my core2 system, this demonstrates itself as

NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.
unchecked MSR access error: WRMSR to 0xdf (tried to write 0x000000ff80000001) at rIP: 0xffffffff90004acc (x86_perf_event_set_period+0xdc/0x190)
ffff8e39f941b000 ffff8e39fbc0a440 000000000000001e ffff8e39fbc0a660
ffff8e39f9523b78 ffffffff90004bd0 ffff8e39f941b000 ffff8e39fbc0a760
ffff8e39fbc0a440 ffff8e39f9523bc0 ffffffff900053ef 00000000f951c2c0
Call Trace:
[<ffffffff90004bd0>] x86_pmu_start+0x50/0x110
[<ffffffff900053ef>] x86_pmu_enable+0x27f/0x2f0
[<ffffffff90175642>] perf_pmu_enable+0x22/0x30
[<ffffffff901766b1>] ctx_resched+0x51/0x60
[<ffffffff901768a0>] __perf_event_enable+0x1e0/0x240
[<ffffffff9016e5f9>] event_function+0xa9/0x180
[<ffffffff901766c0>] ? ctx_resched+0x60/0x60
[<ffffffff9016fcef>] remote_function+0x3f/0x50
[<ffffffff901012b6>] generic_exec_single+0xb6/0x140
[<ffffffff9016fcb0>] ? perf_cgroup_attach+0x50/0x50
[<ffffffff9016fcb0>] ? perf_cgroup_attach+0x50/0x50
[<ffffffff901013f7>] smp_call_function_single+0xb7/0x110
[<ffffffff9016e984>] cpu_function_call+0x34/0x40
[<ffffffff9016e550>] ? list_del_event+0x150/0x150
[<ffffffff9016ecda>] event_function_call+0x11a/0x120
[<ffffffff901766c0>] ? ctx_resched+0x60/0x60
[<ffffffff9016ed79>] _perf_event_enable+0x49/0x70
[<ffffffff901736ac>] perf_event_enable+0x1c/0x40
[<ffffffff9013cad2>] watchdog_enable+0x132/0x1d0
[<ffffffff90092440>] smpboot_thread_fn+0xe0/0x1d0
[<ffffffff90092360>] ? sort_range+0x30/0x30
[<ffffffff9008e7e2>] kthread+0xf2/0x110
[<ffffffff9069e611>] ? wait_for_completion+0xe1/0x110
[<ffffffff906a3b2f>] ret_from_fork+0x1f/0x40
[<ffffffff9008e6f0>] ? kthread_create_on_node+0x220/0x220

as FIXED_EVENT_CONSTRAINT() doesn't produce correct results, and followup
panic in x86_pmu_stop(). YMMV.

Fix this by rewriting the code so that it actually matches the math in
lib/hweight.c. While at it, nuke the original comments altogether; the
"variable" names don't really match what's being used in the C version of
the function, and I find them to be more misleading than helpful.

This version of gcc:

gcc (SUSE Linux) 4.8.5

produces identical assembly code from lib/hweight.c (modulo frame pointer
maths).

Fixes: f5967101e9d ("x86/hweight: Get rid of the special calling convention")
Signed-off-by: Jiri Kosina <jkosina@xxxxxxx>
---

v1 -> v2: remove the comments altogether; put "approved by gcc" note into
changelog

arch/x86/lib/hweight.S | 69 +++++++++++++++++++++++++-------------------------
1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 02de3d7..e1cea44f 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -8,56 +8,55 @@
*/
ENTRY(__sw_hweight32)

-#ifdef CONFIG_X86_64
- movl %edi, %eax # w
-#endif
__ASM_SIZE(push,) %__ASM_REG(dx)
- movl %eax, %edx # w -> t
- shrl %edx # t >>= 1
- andl $0x55555555, %edx # t &= 0x55555555
- subl %edx, %eax # w -= t

- movl %eax, %edx # w -> t
- shrl $2, %eax # w_tmp >>= 2
- andl $0x33333333, %edx # t &= 0x33333333
- andl $0x33333333, %eax # w_tmp &= 0x33333333
- addl %edx, %eax # w = w_tmp + t
+ movl %edi, %edx
+
+ shrl %edx
+ andl $0x55555555, %edx
+ subl %edx, %edi
+ movl %edi, %eax
+
+ shrl $2, %edi
+ andl $0x33333333, %eax
+ andl $0x33333333, %edi
+ addl %eax, %edi

- movl %eax, %edx # w -> t
- shrl $4, %edx # t >>= 4
- addl %edx, %eax # w_tmp += t
- andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f
- imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
- shrl $24, %eax # w = w_tmp >> 24
+ movl %edi, %eax
+ shrl $4, %eax
+ addl %edi, %eax
+ andl $0x0f0f0f0f, %eax
+ imull $0x01010101, %eax, %eax
+ shrl $24, %eax
__ASM_SIZE(pop,) %__ASM_REG(dx)
ret
ENDPROC(__sw_hweight32)

ENTRY(__sw_hweight64)
-#ifdef CONFIG_X86_64
+
pushq %rdx

- movq %rdi, %rdx # w -> t
+ movq %rdi, %rdx
movabsq $0x5555555555555555, %rax
- shrq %rdx # t >>= 1
- andq %rdx, %rax # t &= 0x5555555555555555
+ shrq %rdx
+ andq %rax, %rdx
+ subq %rdx, %rdi
movabsq $0x3333333333333333, %rdx
- subq %rax, %rdi # w -= t
-
- movq %rdi, %rax # w -> t
- shrq $2, %rdi # w_tmp >>= 2
- andq %rdx, %rax # t &= 0x3333333333333333
- andq %rdi, %rdx # w_tmp &= 0x3333333333333333
- addq %rdx, %rax # w = w_tmp + t

- movq %rax, %rdx # w -> t
- shrq $4, %rdx # t >>= 4
- addq %rdx, %rax # w_tmp += t
+ movq %rdi, %rax
+ shrq $2, %rdi
+ andq %rdx, %rax
+ andq %rdx, %rdi
movabsq $0x0f0f0f0f0f0f0f0f, %rdx
- andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f
+ addq %rax, %rdi
+
+ movq %rdi, %rax
+ shrq $4, %rax
+ addq %rdi, %rax
+ andq %rdx, %rax
movabsq $0x0101010101010101, %rdx
- imulq %rdx, %rax # w_tmp *= 0x0101010101010101
- shrq $56, %rax # w = w_tmp >> 56
+ imulq %rdx, %rax
+ shrq $56, %rax

popq %rdx
ret
--
Jiri Kosina
SUSE Labs