[patch 14/47] genirq: Implement a sane sparse_irq allocator

From: Thomas Gleixner
Date: Thu Sep 30 2010 - 19:16:22 EST


The current sparse_irq allocator has several shortcomings due to
failures in the design or the lack of it:

- Requires iteration over the number of active irqs to find a free slot
(Some architectures have grown their own workarounds for this)
- Removal of entries is not possible
- Racy between create_irq_nr and destroy_irq (plugged by horrible
callbacks)
- Migration of active irq descriptors is not possible
- No bulk allocation of irq ranges
- Sprinkled irq_desc references all over the place outside of kernel/irq/
(The previous chip functions series is addressing this issue)

Implement a sane allocator which fixes the above shortcomings (though
migration of active descriptors needs a full tree wide cleanup of the
direct and mostly unlocked access to irq_desc).

The new allocator still uses a radix_tree, but uses a bitmap for
keeping track of allocated irq numbers. That allows:

- Fast lookup of a free slot
- Allows the removal of descriptors
- Prevents the create/destroy race
- Bulk allocation of consecutive irq ranges
- Basic design is ready for migration of live descriptors after
further cleanups

The bitmap is also used in the SPARSE_IRQ=n case for lookup and
raceless (de)allocation of irq numbers. So it removes the requirement
for looping through the descriptor array to find slots.

Right now it uses sparse_irq_lock to protect the bitmap and the radix
tree, but after cleaning up all users we should be able to convert that
to a mutex and to switch the radix_tree and descriptor allocations to
GFP_KERNEL.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
include/linux/irq.h | 25 +++++
kernel/irq/irqdesc.c | 226 +++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 243 insertions(+), 8 deletions(-)

Index: linux-2.6-tip/include/linux/irq.h
===================================================================
--- linux-2.6-tip.orig/include/linux/irq.h
+++ linux-2.6-tip/include/linux/irq.h
@@ -276,6 +276,31 @@ static inline struct irq_desc *move_irq_

extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);

+int irq_alloc_descs(unsigned int irq, unsigned int from, unsigned int cnt, int node);
+
+/* Allocate a single descriptor: no fixed irq number, search starts at 0. */
+static inline int irq_alloc_desc(int node)
+{
+ return irq_alloc_descs(0, 0, 1, node);
+}
+
+/*
+ * Allocate a single descriptor for exactly irq number @at.
+ *
+ * The search has to start at @at as well: irq_alloc_descs() takes the
+ * first free slot at or after @from and fails with -EEXIST when that is
+ * not the requested number, so starting the search at 0 would only
+ * succeed if @at happened to be the lowest free irq.
+ */
+static inline int
+irq_alloc_desc_at(unsigned int at, int node)
+{
+	return irq_alloc_descs(at, at, 1, node);
+}
+
+/* Allocate a single descriptor, searching for a free irq number from @from. */
+static inline int
+irq_alloc_desc_from(unsigned int from, int node)
+{
+ return irq_alloc_descs(0, from, 1, node);
+}
+
+void irq_free_descs(unsigned int irq, unsigned int cnt);
+/* Free the single descriptor of @irq. */
+static inline void irq_free_desc(unsigned int irq)
+{
+ irq_free_descs(irq, 1);
+}
+
/*
* Pick up the arch-dependent methods:
*/
Index: linux-2.6-tip/kernel/irq/irqdesc.c
===================================================================
--- linux-2.6-tip.orig/kernel/irq/irqdesc.c
+++ linux-2.6-tip/kernel/irq/irqdesc.c
@@ -13,6 +13,7 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/radix-tree.h>
+#include <linux/bitmap.h>

#include "internals.h"

@@ -33,9 +34,55 @@ static void __init init_irq_default_affi
}
#endif

+#ifdef CONFIG_SMP
+/*
+ * Allocate the zeroed cpumasks of @desc on @node.
+ *
+ * Returns 0 on success or -ENOMEM; on failure no mask is left allocated.
+ */
+static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
+{
+	bool ok;
+
+	ok = zalloc_cpumask_var_node(&desc->affinity, gfp, node);
+	if (!ok)
+		return -ENOMEM;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	ok = zalloc_cpumask_var_node(&desc->pending_mask, gfp, node);
+	if (!ok) {
+		/* Undo the affinity allocation so nothing leaks */
+		free_cpumask_var(desc->affinity);
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+/* Record @node in the descriptor and reset its affinity to irq_default_affinity. */
+static void desc_smp_init(struct irq_desc *desc, int node)
+{
+ desc->node = node;
+ /* irq_data carries a pointer to the descriptor's affinity mask */
+ desc->irq_data.affinity = &desc->affinity;
+ cpumask_copy(desc->affinity, irq_default_affinity);
+}
+
+#else
+/* !CONFIG_SMP: no masks to allocate and nothing SMP specific to initialize */
+static inline int
+alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
+static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+#endif
+
+/*
+ * Put @desc into its default state for irq number @irq: irq_data wiped and
+ * pointed at no_irq_chip, handle_bad_irq as flow handler, depth 1, no name,
+ * statistics cleared and the SMP fields initialized for @node.
+ */
+static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
+{
+	/* Wipe irq_data first, then refill the fields set up here */
+	memset(&desc->irq_data, 0, sizeof(desc->irq_data));
+	desc->irq_data.irq = irq;
+	desc->irq_data.chip = &no_irq_chip;
+
+	desc->irq = irq;
+	desc->chip = &no_irq_chip;
+	desc->handle_irq = handle_bad_irq;
+	desc->status = IRQ_DEFAULT_INIT_FLAGS;
+	desc->depth = 1;
+	desc->name = NULL;
+
+	/* Clear nr_cpu_ids statistics entries */
+	memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+
+	desc_smp_init(desc, node);
+}
+
int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);

+/* Protects allocated_irqs (and, per the changelog, the sparse irq radix tree) */
+DEFINE_RAW_SPINLOCK(sparse_irq_lock);
+/* One bit per irq number; a set bit marks the number as allocated */
+static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
+
#ifdef CONFIG_SPARSE_IRQ

static struct irq_desc irq_desc_init = {
@@ -90,14 +137,9 @@ static void init_one_irq_desc(int irq, s
arch_init_chip_data(desc, node);
}

-/*
- * Protect the sparse_irqs:
- */
-DEFINE_RAW_SPINLOCK(sparse_irq_lock);
-
static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);

-static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
{
radix_tree_insert(&irq_desc_tree, irq, desc);
}
@@ -116,6 +158,93 @@ void replace_irq_desc(unsigned int irq,
radix_tree_replace_slot(ptr, desc);
}

+/*
+ * Remove the entry for @irq from irq_desc_tree. The caller in this patch,
+ * free_desc(), holds sparse_irq_lock around the call.
+ */
+static void delete_irq_desc(unsigned int irq)
+{
+ radix_tree_delete(&irq_desc_tree, irq);
+}
+
+#ifdef CONFIG_SMP
+/* Release the cpumasks of @desc that alloc_masks() allocated */
+static void free_masks(struct irq_desc *desc)
+{
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ free_cpumask_var(desc->pending_mask);
+#endif
+ free_cpumask_var(desc->affinity);
+}
+#else
+/* !CONFIG_SMP: alloc_masks() allocates nothing, so nothing to free */
+static inline void free_masks(struct irq_desc *desc) { }
+#endif
+
+/*
+ * Allocate and initialize one descriptor for @irq on @node.
+ *
+ * Returns the new descriptor, or NULL if any allocation failed; whatever
+ * was allocated up to the failure point is released again.
+ */
+static struct irq_desc *alloc_desc(int irq, int node)
+{
+	struct irq_desc *desc;
+	gfp_t gfp = GFP_KERNEL;
+
+	desc = kzalloc_node(sizeof(*desc), gfp, node);
+	if (!desc)
+		return NULL;
+
+	/*
+	 * desc_set_defaults() clears nr_cpu_ids entries of kstat_irqs, so
+	 * the array has to be allocated with (at least) that many entries.
+	 */
+	desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
+					gfp, node);
+	if (!desc->kstat_irqs)
+		goto err_desc;
+
+	if (alloc_masks(desc, gfp, node))
+		goto err_kstat;
+
+	raw_spin_lock_init(&desc->lock);
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+
+	/* Also runs desc_smp_init(), so no separate call is needed here */
+	desc_set_defaults(irq, desc, node);
+
+	return desc;
+
+err_kstat:
+	kfree(desc->kstat_irqs);
+err_desc:
+	kfree(desc);
+	return NULL;
+}
+
+/*
+ * Remove the descriptor of @irq from the radix tree and release the
+ * descriptor together with its masks and statistics array.
+ */
+static void free_desc(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	/*
+	 * irq_free_descs() only validates the irq range, so this can be
+	 * reached for a number that has no descriptor (presumably
+	 * irq_to_desc() returns NULL then); there is nothing to free.
+	 */
+	if (!desc)
+		return;
+
+	raw_spin_lock_irqsave(&sparse_irq_lock, flags);
+	delete_irq_desc(irq);
+	raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	free_masks(desc);
+	kfree(desc->kstat_irqs);
+	kfree(desc);
+}
+
+/*
+ * Create and insert the descriptors for irqs @start .. @start + @cnt - 1.
+ *
+ * Returns @start on success. If an allocation fails, the descriptors
+ * created so far are freed again, the whole range is cleared in
+ * allocated_irqs and -ENOMEM is returned.
+ */
+static int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+	int idx = 0;
+
+	while (idx < cnt) {
+		desc = alloc_desc(start + idx, node);
+		if (!desc)
+			goto undo;
+
+		raw_spin_lock_irqsave(&sparse_irq_lock, flags);
+		irq_insert_desc(start + idx, desc);
+		raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
+		idx++;
+	}
+	return start;
+
+undo:
+	/* Tear down, in reverse order, what was set up before the failure */
+	while (--idx >= 0)
+		free_desc(start + idx);
+
+	raw_spin_lock_irqsave(&sparse_irq_lock, flags);
+	bitmap_clear(allocated_irqs, start, cnt);
+	raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
+	return -ENOMEM;
+}
+
static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS_LEGACY-1] = {
.irq = -1,
@@ -162,7 +291,7 @@ int __init early_irq_init(void)
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
alloc_desc_masks(&desc[i], node, true);
init_desc_masks(&desc[i]);
- set_irq_desc(i, &desc[i]);
+ irq_insert_desc(i, &desc[i]);
}

return arch_early_irq_init();
@@ -199,7 +328,7 @@ struct irq_desc * __ref irq_to_desc_allo
}
init_one_irq_desc(irq, desc, node);

- set_irq_desc(irq, desc);
+ irq_insert_desc(irq, desc);

out_unlock:
raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
@@ -257,8 +386,89 @@ struct irq_desc *irq_to_desc_alloc_node(
{
return irq_to_desc(irq);
}
+
+/*
+ * !SPARSE_IRQ: nothing is released here; the descriptor of @irq is reset
+ * to its defaults under desc->lock, keeping its node on SMP.
+ */
+static void free_desc(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+	int node;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+#ifdef CONFIG_SMP
+	node = desc->node;
+#else
+	node = 0;
+#endif
+	desc_set_defaults(irq, desc, node);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+/* !SPARSE_IRQ: nothing is allocated here; just hand back @start */
+static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+ return start;
+}
#endif /* !CONFIG_SPARSE_IRQ */

+/* Dynamic interrupt handling */
+
+/**
+ * irq_free_descs - free irq descriptors
+ * @from:	Start of descriptor range
+ * @cnt:	Number of consecutive irqs to free
+ *
+ * Does nothing when the range @from .. @from + @cnt - 1 does not lie
+ * completely below nr_irqs.
+ */
+void irq_free_descs(unsigned int from, unsigned int cnt)
+{
+	unsigned long flags;
+	unsigned int i;
+
+	/*
+	 * Compare @cnt against the room left above @from rather than
+	 * testing from + cnt, which can wrap around and defeat the check.
+	 */
+	if (from >= nr_irqs || cnt > nr_irqs - from)
+		return;
+
+	for (i = 0; i < cnt; i++)
+		free_desc(from + i);
+
+	/* Only now release the numbers for reuse by irq_alloc_descs() */
+	raw_spin_lock_irqsave(&sparse_irq_lock, flags);
+	bitmap_clear(allocated_irqs, from, cnt);
+	raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
+}
+
+/**
+ * irq_alloc_descs - allocate and initialize a range of irq descriptors
+ * @irq:	Allocate for specific irq number if irq > 0
+ * @from:	Start the search from this irq number
+ * @cnt:	Number of consecutive irqs to allocate.
+ * @node:	Preferred node on which the irq descriptor should be allocated
+ *
+ * Returns the first irq number or error code: -EINVAL for @cnt == 0,
+ * -EEXIST when a specific @irq was requested but the free range found
+ * does not start there, -ENOMEM when no free range is left or the
+ * descriptor allocation fails.
+ */
+int __ref
+irq_alloc_descs(unsigned int irq, unsigned int from, unsigned int cnt, int node)
+{
+	unsigned long flags;
+	int start, ret;
+
+	if (!cnt)
+		return -EINVAL;
+
+	/* Reserve the irq numbers in the bitmap under the lock ... */
+	raw_spin_lock_irqsave(&sparse_irq_lock, flags);
+	start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+	if (irq && start != irq) {
+		ret = -EEXIST;
+	} else if (start >= nr_irqs) {
+		ret = -ENOMEM;
+	} else {
+		bitmap_set(allocated_irqs, start, cnt);
+		ret = 0;
+	}
+	raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	if (ret)
+		return ret;
+
+	/* ... and set up the descriptors after dropping it */
+	return alloc_descs(start, cnt, node);
+}
+
+/* Statistics access */
void clear_kstat_irqs(struct irq_desc *desc)
{
memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/