Re: PATCH: Misrouted IRQ recovery, take 2

From: Alan Cox
Date: Mon Sep 06 2004 - 11:13:31 EST


On Mon, Sep 06, 2004 at 04:54:02PM +0100, David R wrote:
> Quoting Alan Cox <alan@xxxxxxxxxx>:
>
> I'm probably being stupid... but in misrouted_irq(), shouldn't the 'work' flag
> be reset for each irq, rather than initialised outside the main loop?
>

You are correct.

Ok try this


--- linux-2.6.8.1/arch/i386/kernel/irq.c 2004-08-14 11:54:48.000000000 +0100
+++ linux-2.6.8.1.ac/arch/i386/kernel/irq.c 2004-09-07 07:21:13.267562440 +0100
@@ -273,12 +273,103 @@ static int noirqdebug;
static int __init noirqdebug_setup(char *str)
{
noirqdebug = 1;
- printk("IRQ lockup detection disabled\n");
+ printk(KERN_INFO "IRQ lockup detection disabled\n");
return 1;
}

__setup("noirqdebug", noirqdebug_setup);

+static int irqfixup;
+
+static int __init irqfixup_setup(char *str)
+{
+ irqfixup = 1;
+ printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
+ printk(KERN_WARNING "This may impact system performance.\n");
+ return 1;
+}
+
+__setup("irqfixup", irqfixup_setup);
+
+static int __init irqpoll_setup(char *str)
+{
+ irqfixup = 2;
+ printk(KERN_WARNING "Misrouted IRQ fixup and polling support enabled.\n");
+ printk(KERN_WARNING "This may significantly impact system performance.\n");
+ return 1;
+}
+
+__setup("irqpoll", irqpoll_setup);
+
+/*
+ * Recovery handler for misrouted interrupts
+ */
+
+static asmlinkage int misrouted_irq(int irq, struct pt_regs *regs)
+{
+ int i;
+ irq_desc_t *desc;
+ int ok = 0;
+ for(i = 1; i < NR_IRQS; i++)
+ {
+ int work = 0; /* Did we do work for a real IRQ */
+ struct irqaction *action;
+ if(i == irq) /* Already tried */
+ continue;
+ desc = &irq_desc[i];
+ spin_lock(&desc->lock);
+ action = desc->action;
+ /* Already running on another processor */
+ if(desc->status & IRQ_INPROGRESS)
+ {
+ /* Already running: If it is shared get the other
+ CPU to go looking for our mystery interrupt too */
+ if(desc->action && (desc->action->flags & SA_SHIRQ))
+ desc->status |= IRQ_PENDING;
+ spin_unlock(&desc->lock);
+ continue;
+ }
+ /* Honour the normal IRQ locking */
+ desc->status |= IRQ_INPROGRESS;
+ spin_unlock(&desc->lock);
+ while(action)
+ {
+ /* Only shared IRQ handlers are safe to call */
+ if(action->flags & SA_SHIRQ)
+ {
+ if(action->handler(i, action->dev_id, regs) == IRQ_HANDLED)
+ ok = 1;
+ }
+ action = action->next;
+ }
+ local_irq_disable();
+ /* Now clean up the flags */
+ spin_lock(&desc->lock);
+ action = desc->action;
+
+ /* While we were looking for a fixup someone queued a real
+ IRQ clashing with our walk */
+
+ while((desc->status & IRQ_PENDING) && action)
+ {
+ /* Perform real IRQ processing for the IRQ we deferred */
+ work = 1;
+ spin_unlock(&desc->lock);
+ handle_IRQ_event(i, regs, action);
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_PENDING;
+ }
+ desc->status &= ~IRQ_INPROGRESS;
+ /* If we did actual work for the real IRQ line we must
+ let the IRQ controller clean up too */
+ if(work)
+ desc->handler->end(i);
+ spin_unlock(&desc->lock);
+ }
+ /* So the caller can adjust the irq error counts */
+ return ok;
+}
+
/*
* If 99,900 of the previous 100,000 interrupts have not been handled then
* assume that the IRQ is stuck in some manner. Drop a diagnostic and try to
@@ -289,13 +380,69 @@ __setup("noirqdebug", noirqdebug_setup);
*
* Called under desc->lock
*/
-static void note_interrupt(int irq, irq_desc_t *desc, irqreturn_t action_ret)
+static void note_interrupt(int irq, irq_desc_t *desc, irqreturn_t action_ret, struct pt_regs *regs)
{
if (action_ret != IRQ_HANDLED) {
desc->irqs_unhandled++;
if (action_ret != IRQ_NONE)
report_bad_irq(irq, desc, action_ret);
}
+ if(unlikely(irqfixup)) /* Don't punish working computers */
+ {
+ if((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE)
+ {
+#ifdef CONFIG_4KSTACKS
+ u32 *isp;
+ union irq_ctx * curctx;
+ union irq_ctx * irqctx;
+ int ok;
+
+ curctx = (union irq_ctx *) current_thread_info();
+ irqctx = hardirq_ctx[smp_processor_id()];
+
+ spin_unlock(&desc->lock);
+
+ /*
+ * this is where we switch to the IRQ stack. However, if we are already using
+ * the IRQ stack (because we interrupted a hardirq handler) we can't do that
+ * and just have to keep using the current stack (which is the irq stack already
+ * after all)
+ */
+
+ if (curctx == irqctx)
+ ok = misrouted_irq(irq, regs);
+ else {
+ /* build the stack frame on the IRQ stack */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+ irqctx->tinfo.task = curctx->tinfo.task;
+ irqctx->tinfo.previous_esp = current_stack_pointer();
+
+ *--isp = (u32) regs;
+ *--isp = (u32) irq;
+
+ asm volatile(
+ " xchgl %%ebx,%%esp \n"
+ " call misrouted_irq \n"
+ " xchgl %%ebx,%%esp \n"
+ : "=a"(ok)
+ : "b"(isp)
+ : "memory", "cc", "edx", "ecx"
+ );
+ }
+ spin_lock(&desc->lock);
+ if (curctx != irqctx)
+ irqctx->tinfo.task = NULL;
+#else
+ spin_unlock(&desc->lock);
+
+ ok = misrouted_irq(irq, desc, regs);
+
+ spin_lock(&desc->lock);
+#endif
+ if(action_ret == IRQ_NONE)
+ desc->irqs_unhandled -= ok;
+ }
+ }

desc->irq_count++;
if (desc->irq_count < 100000)
@@ -532,7 +679,7 @@ asmlinkage unsigned int do_IRQ(struct pt
}
spin_lock(&desc->lock);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ note_interrupt(irq, desc, action_ret, &regs);
if (curctx != irqctx)
irqctx->tinfo.task = NULL;
if (likely(!(desc->status & IRQ_PENDING)))
@@ -551,7 +698,7 @@ asmlinkage unsigned int do_IRQ(struct pt

spin_lock(&desc->lock);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ note_interrupt(irq, desc, action_ret, &regs);
if (likely(!(desc->status & IRQ_PENDING)))
break;
desc->status &= ~IRQ_PENDING;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/