Re: [KVM PATCH v3 1/3] KVM: fix race in irq_routing logic

From: Gleb Natapov
Date: Tue Oct 27 2009 - 02:45:55 EST


On Mon, Oct 26, 2009 at 12:21:57PM -0400, Gregory Haskins wrote:
> The current code suffers from the following race condition:
>
> thread-1 thread-2
> -----------------------------------------------------------
>
> kvm_set_irq() {
> rcu_read_lock()
> irq_rt = rcu_dereference(table);
> rcu_read_unlock();
>
> kvm_set_irq_routing() {
> mutex_lock();
> irq_rt = table;
> rcu_assign_pointer();
> mutex_unlock();
> synchronize_rcu();
>
> kfree(irq_rt);
>
> irq_rt->entry->set(); /* bad */
>
This is not what happens. irq_rt is never accessed outside read-side
critical section. Data is copied from irq_rt onto the stack and this
copy is accessed outside critical section.

> -------------------------------------------------------------
>
> Because the pointer is accessed outside of the read-side critical
> section. There are two basic patterns we can use to fix this bug:
>
> 1) Switch to sleeping-rcu and encompass the ->set() access within the
> read-side critical section,
>
> OR
>
> 2) Add reference counting to the irq_rt structure, and simply acquire
> the reference from within the RSCS.
>
> This patch implements solution (1).
>
> Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
> ---
>
> include/linux/kvm_host.h | 6 +++++-
> virt/kvm/irq_comm.c | 50 +++++++++++++++++++++++++++-------------------
> virt/kvm/kvm_main.c | 1 +
> 3 files changed, 35 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index bd5a616..1fe135d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -185,7 +185,10 @@ struct kvm {
>
> struct mutex irq_lock;
> #ifdef CONFIG_HAVE_KVM_IRQCHIP
> - struct kvm_irq_routing_table *irq_routing;
> + struct {
> + struct srcu_struct srcu;
> + struct kvm_irq_routing_table *table;
> + } irq_routing;
> struct hlist_head mask_notifier_list;
> struct hlist_head irq_ack_notifier_list;
> #endif
> @@ -541,6 +544,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
> const struct kvm_irq_routing_entry *entries,
> unsigned nr,
> unsigned flags);
> +void kvm_init_irq_routing(struct kvm *kvm);
> void kvm_free_irq_routing(struct kvm *kvm);
>
> #else
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 00c68d2..db2553f 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -144,10 +144,11 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
> */
> int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> {
> - struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> - int ret = -1, i = 0;
> + struct kvm_kernel_irq_routing_entry *e;
> + int ret = -1;
> struct kvm_irq_routing_table *irq_rt;
> struct hlist_node *n;
> + int idx;
>
> trace_kvm_set_irq(irq, level, irq_source_id);
>
> @@ -155,21 +156,19 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> * IOAPIC. So set the bit in both. The guest will ignore
> * writes to the unused one.
> */
> - rcu_read_lock();
> - irq_rt = rcu_dereference(kvm->irq_routing);
> + idx = srcu_read_lock(&kvm->irq_routing.srcu);
> + irq_rt = rcu_dereference(kvm->irq_routing.table);
> if (irq < irq_rt->nr_rt_entries)
> - hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
> - irq_set[i++] = *e;
> - rcu_read_unlock();
> + hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
> + int r;
>
> - while(i--) {
> - int r;
> - r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
> - if (r < 0)
> - continue;
> + r = e->set(e, kvm, irq_source_id, level);
> + if (r < 0)
> + continue;
>
> - ret = r + ((ret < 0) ? 0 : ret);
> - }
> + ret = r + ((ret < 0) ? 0 : ret);
> + }
> + srcu_read_unlock(&kvm->irq_routing.srcu, idx);
>
> return ret;
> }
> @@ -179,17 +178,18 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
> struct kvm_irq_ack_notifier *kian;
> struct hlist_node *n;
> int gsi;
> + int idx;
>
> trace_kvm_ack_irq(irqchip, pin);
>
> - rcu_read_lock();
> - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
> + idx = srcu_read_lock(&kvm->irq_routing.srcu);
> + gsi = rcu_dereference(kvm->irq_routing.table)->chip[irqchip][pin];
> if (gsi != -1)
> hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
> link)
> if (kian->gsi == gsi)
> kian->irq_acked(kian);
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_routing.srcu, idx);
> }
>
> void kvm_register_irq_ack_notifier(struct kvm *kvm,
> @@ -287,11 +287,19 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
> rcu_read_unlock();
> }
>
> +void kvm_init_irq_routing(struct kvm *kvm)
> +{
> + init_srcu_struct(&kvm->irq_routing.srcu);
> +}
> +
> void kvm_free_irq_routing(struct kvm *kvm)
> {
> /* Called only during vm destruction. Nobody can use the pointer
> at this stage */
> - kfree(kvm->irq_routing);
> + synchronize_srcu(&kvm->irq_routing.srcu);
> + cleanup_srcu_struct(&kvm->irq_routing.srcu);
> +
> + kfree(kvm->irq_routing.table);
> }
>
> static int setup_routing_entry(struct kvm_irq_routing_table *rt,
> @@ -396,10 +404,10 @@ int kvm_set_irq_routing(struct kvm *kvm,
> }
>
> mutex_lock(&kvm->irq_lock);
> - old = kvm->irq_routing;
> - rcu_assign_pointer(kvm->irq_routing, new);
> + old = kvm->irq_routing.table;
> + rcu_assign_pointer(kvm->irq_routing.table, new);
> mutex_unlock(&kvm->irq_lock);
> - synchronize_rcu();
> + synchronize_srcu(&kvm->irq_routing.srcu);
>
> new = old;
> r = 0;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index cac69c4..ba94159 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -388,6 +388,7 @@ static struct kvm *kvm_create_vm(void)
> atomic_inc(&kvm->mm->mm_count);
> spin_lock_init(&kvm->mmu_lock);
> spin_lock_init(&kvm->requests_lock);
> + kvm_init_irq_routing(kvm);
> kvm_io_bus_init(&kvm->pio_bus);
> kvm_eventfd_init(kvm);
> mutex_init(&kvm->lock);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/