[RFC PATCH] pci hotplug: fix hotplug bug during kernel boot
From: fuqiang wang
Date: Mon Jun 30 2025 - 03:20:45 EST
Performing hotplug operations right after a virtual machine has started
may cause the guest kernel to hit a BUG_ON() on the x86 architecture.
The resulting dmesg output is as follows:
------------[ cut here ]------------
kernel BUG at kernel/resource.c:792!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 1 UID: 0 PID: 215 Comm: kworker/u128:5 Not tainted 6.14.0-rc1+ #17
Hardware name: JD JCloud Iaas Jvirt, BIOS unknown 2/2/2022
Workqueue: kacpi_hotplug acpi_hotplug_work_fn
RIP: 0010:reallocate_resource+0x197/0x1d0
Code: 20 48 8b 44 24 28 48 89 43 28 48 8b 44 24 30 48 89 43 30 48 8b 44 24 38 48 89 43 38 e8 12 db ff ff 48 85 c0 0f 84 5d ff ff ff <0f> 0b 48 8b 74 24 08 48 3b 73 08 0f 82 1c ff ff ff 48 89 0b 48 89
RSP: 0000:ffffc900008479b0 EFLAGS: 00010282
RAX: ffff8881020c73b0 RBX: ffff8881021813b0 RCX: 000000000000343f
RDX: 0000000000003400 RSI: ffff8881021813b0 RDI: ffff8881020c73b0
RBP: 0000000000000000 R08: ffff8881021863e0 R09: 0000000000000040
R10: 0000000000000000 R11: 000000000000343f R12: ffff88810020d6f0
R13: ffffc90000847a20 R14: ffff88810020d6f0 R15: ffffffff82edb970
FS: 0000000000000000(0000) GS:ffff88842ee80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000003036001 CR4: 0000000000170ef0
Call Trace:
<TASK>
? die+0x32/0x80
? do_trap+0xd9/0x100
? reallocate_resource+0x197/0x1d0
? do_error_trap+0x65/0x80
? reallocate_resource+0x197/0x1d0
? exc_invalid_op+0x4c/0x60
? reallocate_resource+0x197/0x1d0
? asm_exc_invalid_op+0x16/0x20
? reallocate_resource+0x197/0x1d0
allocate_resource+0x57/0xd0
? __pfx_pcibios_align_resource+0x10/0x10
pci_bus_alloc_from_region+0x1df/0x240
? __pfx_pcibios_align_resource+0x10/0x10
? __pfx_pcibios_align_resource+0x10/0x10
? __pfx_pcibios_align_resource+0x10/0x10
pci_bus_alloc_resource+0x86/0xb0
? __pfx_pcibios_align_resource+0x10/0x10
_pci_assign_resource+0x9e/0x120
? __pfx_pcibios_align_resource+0x10/0x10
pci_assign_resource+0xae/0x290
assign_requested_resources_sorted+0x4a/0xb0
__assign_resources_sorted+0x491/0x4d0
? __dev_sort_resources+0x9b/0x2a0
__pci_bus_assign_resources+0x6f/0x1f0
enable_slot+0x25e/0x440
? pci_device_is_present+0x49/0x70
acpiphp_check_bridge.part.0+0x117/0x150
hotplug_event+0x13d/0x220
? __pfx_acpiphp_hotplug_notify+0x10/0x10
acpiphp_hotplug_notify+0x20/0x60
acpi_device_hotplug+0xae/0x240
acpi_hotplug_work_fn+0x1a/0x30
process_one_work+0x184/0x3a0
worker_thread+0x24d/0x360
? __pfx_worker_thread+0x10/0x10
kthread+0xed/0x220
? finish_task_switch.isra.0+0x88/0x2b0
? __pfx_kthread+0x10/0x10
ret_from_fork+0x30/0x50
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Modules linked in:
The cause of the issue is that the enable_slot path of hotplug races
with the pcibios_init path that runs during kernel initialization. As a
result, in the enable_slot path, __dev_sort_resources first links all
the resources of the devices downstream of the bridge into the list
`head` (since they have no parent yet). Subsequently, in the
pcibios_init path, pci_claim_resource claims the BIOS-assigned ranges
for these devices.
  hotplug CPU                        kernel init CPU
  enable_slot
    ...
    __dev_sort_resources
      //link all resources behind the bus
      //into head
                                     pcibios_init
                                       ...
                                       pcibios_allocate_bus_resources
                                         //alloc resource for all bus
                                         //resources linked into head have
                                         //sibling and parent
However, in the subsequent steps of enable_slot, certain resources may
be reallocated due to the x86 alignment rule -- "0x000-0x0ff region
modulo 0x400" (see pcibios_align_resource). During this reallocation,
the alignment can leave gaps, which makes the allocation fail and causes
the resource to be reset. Additionally, since this resource was already
linked into bus->resource[]->child during kernel initialization, a
bogus resource range [0, 0] appears in that chain. This causes
subsequent devices to be allocated ranges that conflict with other
resources. For a detailed analysis, see [1].
This patch makes the hotplug path wait until pcibios_init has completed
during kernel initialization. (However, I am not sure whether this
change is appropriate, so I would appreciate your advice.)
[1]: https://github.com/cai-fuqiang/md/blob/master/case/guestkernel_hotplug_BUG_ON/kernel_panic.md
Signed-off-by: fuqiang wang <fuqiang.wng@xxxxxxxxx>
---
arch/x86/pci/common.c | 16 ++++++++++++++++
drivers/acpi/scan.c | 6 ++++++
include/linux/pci.h | 1 +
3 files changed, 23 insertions(+)
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index ddb798603201..06ff04ab2a26 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -37,6 +37,8 @@ unsigned long pirq_table_addr;
const struct pci_raw_ops *__read_mostly raw_pci_ops;
const struct pci_raw_ops *__read_mostly raw_pci_ext_ops;
+DECLARE_COMPLETION(pcibios_init_completion);
+
int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
int reg, int len, u32 *val)
{
@@ -498,6 +500,17 @@ void __init pcibios_set_cache_line_size(void)
}
}
+static DEFINE_STATIC_KEY_FALSE(pcibios_init_done); /* starts false; enabled below once a caller has observed pcibios_init_completion */
+
+void arch_wait_pcibios_init_complete(void) /* block the caller until pcibios_init() has signalled pcibios_init_completion */
+{ /* NOTE(review): pcibios_init() has an early branch (!raw_pci_ops && !raw_pci_ext_ops) whose body is not visible here -- confirm it cannot return without calling complete() */
+ if (static_branch_likely(&pcibios_init_done)) /* fast path: an earlier call already waited successfully */
+ return;
+
+ wait_for_completion(&pcibios_init_completion); /* NOTE(review): signalled with complete(), presumably releasing a single waiter -- confirm every caller is serialized */
+ static_branch_enable(&pcibios_init_done); /* subsequent calls return without touching the completion */
+}
+
int __init pcibios_init(void)
{
if (!raw_pci_ops && !raw_pci_ext_ops) {
@@ -510,6 +523,9 @@ int __init pcibios_init(void)
if (pci_bf_sort >= pci_force_bf)
pci_sort_breadthfirst();
+
+ complete(&pcibios_init_completion);
+
return 0;
}
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 9f4efa8f75a6..a66fbc262fb8 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -21,6 +21,7 @@
#include <linux/pgtable.h>
#include <linux/crc32.h>
#include <linux/dma-direct.h>
+#include <linux/pci.h>
#include "internal.h"
#include "sleep.h"
@@ -435,12 +436,17 @@ static int acpi_generic_hotplug_event(struct acpi_device *adev, u32 type)
return -EINVAL;
}
+void __weak arch_wait_pcibios_init_complete(void) {} /* weak no-op default; arch/x86/pci/common.c in this patch provides the real implementation */
+
void acpi_device_hotplug(struct acpi_device *adev, u32 src)
{
u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE;
int error = -ENODEV;
lock_device_hotplug();
+
+ arch_wait_pcibios_init_complete();
+
mutex_lock(&acpi_scan_lock);
/*
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 47b31ad724fa..8078b68a9b0f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2356,6 +2356,7 @@ static inline void pcibios_penalize_isa_irq(int irq, int active) {}
int pcibios_alloc_irq(struct pci_dev *dev);
void pcibios_free_irq(struct pci_dev *dev);
resource_size_t pcibios_default_alignment(void);
+void arch_wait_pcibios_init_complete(void);
#if !defined(HAVE_PCI_MMAP) && !defined(ARCH_GENERIC_PCI_MMAP_RESOURCE)
extern int pci_create_resource_files(struct pci_dev *dev);
--
2.47.0