Re: [2.6.17-rc5-mm2] crash when doing second suspend: BUG inarch/i386/kernel/nmi.c:174

From: Shaohua Li
Date: Tue Jun 06 2006 - 03:04:14 EST


On Mon, 2006-06-05 at 16:35 +0800, Miles Lane wrote:
> On 6/5/06, Andrew Morton <akpm@xxxxxxxx> wrote:
>
> > Do you think the suspend breakage is related to that patch?
> >
> > Miles also reports that every second suspend fails for him. Miles,
> does
> > 'nmi_watchdog=0' make it better?
>
> I tried using that as an appended boot option, but it didn't change
> the
> behavior.
Does below patch help? The nmi suspend/resume doesn't look good to me.
Only CPU0 uses the suspend/resume code path. Other CPUs run the CPU
hotplug code path.

Signed-off-by: Shaohua Li <shaohua.li@xxxxxxxxx>
---

linux-2.6.17-rc5-mm3-root/arch/i386/kernel/nmi.c | 14 +++++++++-----
linux-2.6.17-rc5-mm3-root/arch/i386/kernel/smpboot.c | 3 ++-
linux-2.6.17-rc5-mm3-root/arch/x86_64/kernel/nmi.c | 14 +++++++++-----
linux-2.6.17-rc5-mm3-root/arch/x86_64/kernel/smpboot.c | 2 ++
linux-2.6.17-rc5-mm3-root/include/asm-i386/nmi.h | 1 +
linux-2.6.17-rc5-mm3-root/include/asm-x86_64/nmi.h | 1 +
6 files changed, 24 insertions(+), 11 deletions(-)

diff -puN arch/i386/kernel/nmi.c~nmi arch/i386/kernel/nmi.c
--- linux-2.6.17-rc5-mm3/arch/i386/kernel/nmi.c~nmi 2006-06-04 15:22:06.000000000 +0800
+++ linux-2.6.17-rc5-mm3-root/arch/i386/kernel/nmi.c 2006-06-05 12:28:17.000000000 +0800
@@ -65,7 +65,6 @@ struct nmi_watchdog_ctlblk {
static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

/* local prototypes */
-static void stop_apic_nmi_watchdog(void *unused);
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

extern void show_registers(struct pt_regs *regs);
@@ -377,15 +376,20 @@ static int nmi_pm_active; /* nmi_active

static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
+ /* only CPU0 goes here, other CPUs should be offline */
nmi_pm_active = atomic_read(&nmi_active);
- disable_lapic_nmi_watchdog();
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
return 0;
}

static int lapic_nmi_resume(struct sys_device *dev)
{
- if (nmi_pm_active > 0)
- enable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
return 0;
}

@@ -798,7 +802,7 @@ void setup_apic_nmi_watchdog (void *unus
atomic_inc(&nmi_active);
}

-static void stop_apic_nmi_watchdog(void *unused)
+void stop_apic_nmi_watchdog(void *unused)
{
/* only support LOCAL and IO APICs for now */
if ((nmi_watchdog != NMI_LOCAL_APIC) &&
diff -puN arch/i386/kernel/smpboot.c~nmi arch/i386/kernel/smpboot.c
--- linux-2.6.17-rc5-mm3/arch/i386/kernel/smpboot.c~nmi 2006-06-04 15:26:47.000000000 +0800
+++ linux-2.6.17-rc5-mm3-root/arch/i386/kernel/smpboot.c 2006-06-05 08:51:16.000000000 +0800
@@ -1368,7 +1368,8 @@ int __cpu_disable(void)
*/
if (cpu == 0)
return -EBUSY;
-
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
/* Allow any queued timer interrupts to get serviced */
local_irq_enable();
diff -puN include/asm-i386/nmi.h~nmi include/asm-i386/nmi.h
--- linux-2.6.17-rc5-mm3/include/asm-i386/nmi.h~nmi 2006-06-05 08:50:23.000000000 +0800
+++ linux-2.6.17-rc5-mm3-root/include/asm-i386/nmi.h 2006-06-05 08:50:33.000000000 +0800
@@ -23,6 +23,7 @@ extern int reserve_evntsel_nmi(unsigned
extern void release_evntsel_nmi(unsigned int);

extern void setup_apic_nmi_watchdog (void *);
+extern void stop_apic_nmi_watchdog (void *);
extern void disable_timer_nmi_watchdog(void);
extern void enable_timer_nmi_watchdog(void);
extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
diff -puN arch/x86_64/kernel/nmi.c~nmi arch/x86_64/kernel/nmi.c
--- linux-2.6.17-rc5-mm3/arch/x86_64/kernel/nmi.c~nmi 2006-06-05 12:24:03.000000000 +0800
+++ linux-2.6.17-rc5-mm3-root/arch/x86_64/kernel/nmi.c 2006-06-05 12:26:00.000000000 +0800
@@ -65,7 +65,6 @@ struct nmi_watchdog_ctlblk {
static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

/* local prototypes */
-static void stop_apic_nmi_watchdog(void *unused);
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

/* converts an msr to an appropriate reservation bit */
@@ -363,15 +362,20 @@ static int nmi_pm_active; /* nmi_active

static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
+ /* only CPU0 goes here, other CPUs should be offline */
nmi_pm_active = atomic_read(&nmi_active);
- disable_lapic_nmi_watchdog();
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
return 0;
}

static int lapic_nmi_resume(struct sys_device *dev)
{
- if (nmi_pm_active > 0)
- enable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
return 0;
}

@@ -709,7 +713,7 @@ void setup_apic_nmi_watchdog(void *unuse
atomic_inc(&nmi_active);
}

-static void stop_apic_nmi_watchdog(void *unused)
+void stop_apic_nmi_watchdog(void *unused)
{
/* only support LOCAL and IO APICs for now */
if ((nmi_watchdog != NMI_LOCAL_APIC) &&
diff -puN include/asm-x86_64/nmi.h~nmi include/asm-x86_64/nmi.h
--- linux-2.6.17-rc5-mm3/include/asm-x86_64/nmi.h~nmi 2006-06-05 12:34:27.000000000 +0800
+++ linux-2.6.17-rc5-mm3-root/include/asm-x86_64/nmi.h 2006-06-05 12:34:41.000000000 +0800
@@ -54,6 +54,7 @@ extern int reserve_evntsel_nmi(unsigned
extern void release_evntsel_nmi(unsigned int);

extern void setup_apic_nmi_watchdog (void *);
+extern void stop_apic_nmi_watchdog (void *);
extern void disable_timer_nmi_watchdog(void);
extern void enable_timer_nmi_watchdog(void);
extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
diff -puN arch/x86_64/kernel/smpboot.c~nmi arch/x86_64/kernel/smpboot.c
--- linux-2.6.17-rc5-mm3/arch/x86_64/kernel/smpboot.c~nmi 2006-06-05 12:34:56.000000000 +0800
+++ linux-2.6.17-rc5-mm3-root/arch/x86_64/kernel/smpboot.c 2006-06-05 12:45:58.000000000 +0800
@@ -1232,6 +1232,8 @@ int __cpu_disable(void)
if (cpu == 0)
return -EBUSY;

+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
clear_local_APIC();

/*
_
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/