Re: [PATCH v3 03/12] x86/irq: Remove bitfields in posted interrupt descriptor

From: Oliver Sang
Date: Wed May 01 2024 - 03:30:30 EST



hi, Jacob Pan,


On Tue, Apr 23, 2024 at 10:41:05AM -0700, Jacob Pan wrote:
> Mixture of bitfields and types is weird and really not intuitive, remove
> bitfields and use typed data exclusively. Bitfields often result in
> inferior machine code.
>
> Link: https://lore.kernel.org/all/20240404101735.402feec8@jacob-builder/T/#mf66e34a82a48f4d8e2926b5581eff59a122de53a
> Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Jacob Pan <jacob.jun.pan@xxxxxxxxxxxxxxx>
>
> ---
> v3:
> - Fix a bug where SN bit position was used as the mask, reported by
> Oliver Sang.

we tested this version, and confirmed the issue is gone

Tested-by: kernel test robot <oliver.sang@xxxxxxxxx>


the details below are just for reference.

for the previous version, we noticed a
"WARNING:at_arch/x86/kvm/vmx/posted_intr.c:#pi_enable_wakeup_handler[kvm_intel]"
in the testcase: kernel-selftests

The call trace is shown below:

[ 399.225452][ T8098] ------------[ cut here ]------------
[ 399.232475][ T8098] PI descriptor SN field set before blocking
[ 399.232514][ T8098] WARNING: CPU: 184 PID: 8098 at arch/x86/kvm/vmx/posted_intr.c:160 pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel]
[ 399.254685][ T8098] Modules linked in: openvswitch nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 intel_rapl_msr intel_rapl_common btrfs
+blake2b_generic x86_pkg_temp_thermal xor intel_powerclamp zstd_compress coretemp raid6_pq libcrc32c kvm_intel kvm nvme crct10dif_pclmul crc32_pclmul
+nvme_core crc32c_intel t10_pi ghash_clmulni_intel ast sha512_ssse3 rapl intel_cstate dax_hmem crc64_rocksoft_generic drm_shmem_helper mei_me i2c_i801
+crc64_rocksoft mei drm_kms_helper i2c_smbus crc64 i2c_ismt wmi ipmi_ssif acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler acpi_pad acpi_power_meter joydev
+binfmt_misc loop fuse drm dm_mod ip_tables
[ 399.321529][ T8098] CPU: 184 PID: 8098 Comm: xapic_ipi_test Not tainted 6.9.0-rc1-00008-g037ccaed5dc5 #1
[ 399.333325][ T8098] RIP: 0010:pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel]
[ 399.342631][ T8098] Code: e8 a4 5a 7b c0 e9 b2 fc ff ff e8 5a 5c 7b c0 fb eb 93 bf f1 00 00 00 e8 dd 9c 29 c0 eb 82 48 c7 c7 e0 c6 ee c0 e8 0f 2f 37 c0
+<0f> 0b e9 e7 fe ff ff 4c 89 f7 e8 00 6a c8 c0 e9 cd fe ff ff 4c 89
[ 399.365742][ T8098] RSP: 0018:ffa000003a527780 EFLAGS: 00010086
[ 399.373626][ T8098] RAX: 0000000000000000 RBX: ff1100247c23c740 RCX: 0000000000000027
[ 399.383668][ T8098] RDX: 0000000000000027 RSI: 0000000000000004 RDI: ff11003fc3030c08
[ 399.393759][ T8098] RBP: ffa000003a527908 R08: 0000000000000001 R09: ffe21c07f8606181
[ 399.403834][ T8098] R10: ff11003fc3030c0b R11: 0000000000000000 R12: 1ff40000074a4ef4
[ 399.413901][ T8098] R13: 0000000000000000 R14: ff1100247c23e4a0 R15: 00000000000000b8
[ 399.423969][ T8098] FS: 00007f10de2006c0(0000) GS:ff11003fc3000000(0000) knlGS:0000000000000000
[ 399.435126][ T8098] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 399.443651][ T8098] CR2: 0000000000000000 CR3: 000000210b958004 CR4: 0000000000f73ef0
[ 399.453753][ T8098] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 399.463835][ T8098] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[ 399.473912][ T8098] PKRU: 55555554
[ 399.479005][ T8098] Call Trace:
[ 399.483812][ T8098] <TASK>
[ 399.488224][ T8098] ? __warn+0xcc/0x2d0
[ 399.493903][ T8098] ? pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel]
[ 399.502643][ T8098] ? report_bug+0x261/0x2c0
[ 399.508825][ T8098] ? handle_bug+0x3a/0x90
[ 399.514817][ T8098] ? exc_invalid_op+0x17/0x40
[ 399.521191][ T8098] ? asm_exc_invalid_op+0x1a/0x20
[ 399.527938][ T8098] ? pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel]
[ 399.536677][ T8098] ? __pfx_pi_enable_wakeup_handler+0x10/0x10 [kvm_intel]
[ 399.545747][ T8098] ? __pfx_lock_repin_lock+0x10/0x10
[ 399.552740][ T8098] ? newidle_balance+0xc85/0x1300
[ 399.559408][ T8098] ? __pfx_perf_event_context_sched_out+0x10/0x10
[ 399.567652][ T8098] ? lock_acquire+0x432/0x4e0
[ 399.573959][ T8098] ? vmx_get_rflags+0x26/0x2c0 [kvm_intel]
[ 399.581527][ T8098] vmx_vcpu_pi_put+0x1d3/0x230 [kvm_intel]
[ 399.589119][ T8098] vmx_vcpu_put+0x12/0x20 [kvm_intel]
[ 399.596205][ T8098] kvm_arch_vcpu_put+0x49e/0x7b0 [kvm]
[ 399.603463][ T8098] kvm_sched_out+0xb2/0xe0 [kvm]
[ 399.610100][ T8098] prepare_task_switch+0x321/0xc40
[ 399.616863][ T8098] ? lock_release+0x1bf/0x240
[ 399.623146][ T8098] __schedule+0x5a6/0x20b0
[ 399.629125][ T8098] ? __pfx___schedule+0x10/0x10
[ 399.635581][ T8098] ? __pfx_lock_acquire+0x10/0x10
[ 399.642228][ T8098] ? kvm_apic_has_interrupt+0x9c/0x160 [kvm]
[ 399.650034][ T8098] ? lock_acquire+0x432/0x4e0
[ 399.656267][ T8098] ? __pfx_lock_acquire+0x10/0x10
[ 399.662905][ T8098] schedule+0xe2/0x2a0
[ 399.668478][ T8098] kvm_vcpu_block+0xd1/0x1c0 [kvm]
[ 399.675308][ T8098] kvm_vcpu_halt+0xee/0x900 [kvm]
[ 399.682018][ T8098] vcpu_run+0x50a/0x9d0 [kvm]
[ 399.688346][ T8098] kvm_arch_vcpu_ioctl_run+0x377/0x1430 [kvm]
[ 399.696254][ T8098] ? lock_release+0xe5/0x240
[ 399.702428][ T8098] kvm_vcpu_ioctl+0x34c/0xc40 [kvm]
[ 399.709341][ T8098] ? __pfx_kvm_vcpu_ioctl+0x10/0x10 [kvm]
[ 399.716821][ T8098] ? __lock_release+0x103/0x440
[ 399.723960][ T8098] ? __fget_files+0x1c7/0x330
[ 399.730219][ T8098] ? __pfx___lock_release+0x10/0x10
[ 399.737750][ T8098] ? __fget_files+0x1cc/0x330
[ 399.744028][ T8098] ? __fget_files+0x1c7/0x330
[ 399.750289][ T8098] ? lock_release+0xe5/0x240
[ 399.756461][ T8098] ? __fget_files+0x1cc/0x330
[ 399.762719][ T8098] __x64_sys_ioctl+0x134/0x1b0
[ 399.769077][ T8098] do_syscall_64+0x93/0x170
[ 399.775146][ T8098] ? do_user_addr_fault+0x477/0xcb0
[ 399.781989][ T8098] ? lockdep_hardirqs_on_prepare+0x279/0x3e0
[ 399.789654][ T8098] entry_SYSCALL_64_after_hwframe+0x6c/0x74
[ 399.797192][ T8098] RIP: 0033:0x7f10de88bc5b
[ 399.803079][ T8098] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05
+<89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00
[ 399.826105][ T8098] RSP: 002b:00007f10de1ff9c0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 399.836565][ T8098] RAX: ffffffffffffffda RBX: 000000001a5418c0 RCX: 00007f10de88bc5b
[ 399.846521][ T8098] R


> - Add and use non-atomic helpers to manipulate SN bit
> - Use pi_test_sn() instead of open coding
> v2:
> - Replace bitfields, no more mix.
> ---
> arch/x86/include/asm/posted_intr.h | 21 ++++++++++++---------
> arch/x86/kvm/vmx/posted_intr.c | 4 ++--
> arch/x86/kvm/vmx/vmx.c | 2 +-
> 3 files changed, 15 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
> index acf237b2882e..20e31891de15 100644
> --- a/arch/x86/include/asm/posted_intr.h
> +++ b/arch/x86/include/asm/posted_intr.h
> @@ -15,17 +15,9 @@ struct pi_desc {
> };
> union {
> struct {
> - /* bit 256 - Outstanding Notification */
> - u16 on : 1,
> - /* bit 257 - Suppress Notification */
> - sn : 1,
> - /* bit 271:258 - Reserved */
> - rsvd_1 : 14;
> - /* bit 279:272 - Notification Vector */
> + u16 notifications; /* Suppress and outstanding bits */
> u8 nv;
> - /* bit 287:280 - Reserved */
> u8 rsvd_2;
> - /* bit 319:288 - Notification Destination */
> u32 ndst;
> };
> u64 control;
> @@ -88,4 +80,15 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
> return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
> }
>
> +/* Non-atomic helpers */
> +static inline void __pi_set_sn(struct pi_desc *pi_desc)
> +{
> + pi_desc->notifications |= BIT(POSTED_INTR_SN);
> +}
> +
> +static inline void __pi_clear_sn(struct pi_desc *pi_desc)
> +{
> + pi_desc->notifications &= ~BIT(POSTED_INTR_SN);
> +}
> +
> #endif /* _X86_POSTED_INTR_H */
> diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
> index af662312fd07..ec08fa3caf43 100644
> --- a/arch/x86/kvm/vmx/posted_intr.c
> +++ b/arch/x86/kvm/vmx/posted_intr.c
> @@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
> * handle task migration (@cpu != vcpu->cpu).
> */
> new.ndst = dest;
> - new.sn = 0;
> + __pi_clear_sn(&new);
>
> /*
> * Restore the notification vector; in the blocking case, the
> @@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
> &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
> raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
>
> - WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
> + WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");
>
> old.control = READ_ONCE(pi_desc->control);
> do {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index d94bb069bac9..f505745913c8 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4843,7 +4843,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
> * or POSTED_INTR_WAKEUP_VECTOR.
> */
> vmx->pi_desc.nv = POSTED_INTR_VECTOR;
> - vmx->pi_desc.sn = 1;
> + __pi_set_sn(&vmx->pi_desc);
> }
>
> static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> --
> 2.25.1
>