Re: [kvm-devel] [BUG] Oops with KVM-27

From: Luca Tettamanti
Date: Fri Jun 15 2007 - 17:49:33 EST


On Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity wrote:
> > After a bit of thinking: it's correct but removes an optimization;
> > furthermore it may miss other instructions that write to memory mapped
> > areas.
> > A more proper fix should be "force the writeback if dst.ptr is in some
> > kind of mmio area".
> >
>
> I think we can just disable the optimization, and (in a separate patch)
> add it back in emulator_write_phys(), where we know we're modifying
> memory and not hitting mmio.

The problem is that by the time we reach emulator_write_phys() we've
already lost track of the previous value (dst.orig_val). I moved the
decision up into cmpxchg_emulated; unfortunately this means that the
non-locked write path (write_emulated) can't do this optimization,
unless I change its signature to include the old value.
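
The signature change would look something like this (just a sketch, not
part of either patch):

	static int emulator_write_emulated(unsigned long addr,
					   const void *old,	/* hypothetical: previous
								 * value, NULL if unknown */
					   const void *val,
					   unsigned int bytes,
					   struct x86_emulate_ctxt *ctxt);

with the memcmp() short-circuit done inside whenever 'old' is non-NULL.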

The first patch makes the writeback step unconditional whenever we have
a destination operand (the mov check (d & Mov) may be superfluous then,
yes?). The write-to-register path still has the optimization that skips
the write when the value is unchanged.
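
To see why the unconditional dispatch matters for OP_MEM (illustrative
example, not from the original report): something like

	lock orl $0, (%eax)	/* dst.val == dst.orig_val */

leaves the destination value unchanged, so the old test skipped the
cmpxchg_emulated() dispatch entirely; if (%eax) points at a device
register, the write side effect is lost.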


--- a/kernel/x86_emulate.c	2007-06-15 21:13:51.000000000 +0200
+++ b/kernel/x86_emulate.c	2007-06-15 22:12:15.000000000 +0200
@@ -1057,9 +1057,8 @@
 	}
 
 writeback:
-	if ((d & Mov) || (dst.orig_val != dst.val)) {
-		switch (dst.type) {
-		case OP_REG:
+	if ((d & Mov) || (d & DstMask) == DstMem || (d & DstMask) == DstReg) {
+		if (dst.type == OP_REG && dst.orig_val != dst.val) {
 			/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
 			switch (dst.bytes) {
 			case 1:
@@ -1075,8 +1074,10 @@
 				*dst.ptr = dst.val;
 				break;
 			}
-			break;
-		case OP_MEM:
+		} else if (dst.type == OP_MEM) {
+			/* Always dispatch the write, since it may hit a
+			 * MMIO area.
+			 */
 			if (lock_prefix)
 				rc = ops->cmpxchg_emulated((unsigned long)dst.
 							   ptr, &dst.orig_val,
@@ -1088,8 +1089,6 @@
 						 ctxt);
 			if (rc != 0)
 				goto done;
-		default:
-			break;
 		}
 	}


Next one: I've split emulator_write_phys into emulator_write_phys_mem
(for normal RAM) and emulator_write_phys_mmio (for everything else). The
cmpxchg code path is roughly:

	if (!mapped)
		page_fault();

	page = gfn_to_page(...);
	if (page) {
		if (!memcmp(old, new, bytes))
			return X86EMUL_CONTINUE;

		write_mem();
		return X86EMUL_CONTINUE;
	}
	/* for MMIO always do the write */
	write_mmio();

I'm a bit confused about this test, found in emulator_write_phys
(original code):

if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
return 0;

AFAICT it makes the emulator skip the write when the modified area spans
two different (physical) pages. When this happens write_emulated falls
back to an MMIO write. I'd expect the function to map both pages and do
the memory write on each part instead.
I've preserved this logic in the following patch, but I don't understand
why it's correct :|
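
For example, with 4K pages (PAGE_SHIFT == 12), a 4-byte write near the
end of a page trips the check:

	gpa = 0x0ffe, bytes = 4
	(gpa + bytes - 1) >> PAGE_SHIFT  ==  0x1001 >> 12  ==  1
	 gpa              >> PAGE_SHIFT  ==  0x0ffe >> 12  ==  0

so the last bytes land in the next page and the whole write is bounced
to the MMIO path instead of being split across the two pages.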

--- a/kernel/kvm_main.c	2007-06-15 21:18:08.000000000 +0200
+++ b/kernel/kvm_main.c	2007-06-15 23:34:13.000000000 +0200
@@ -1125,26 +1125,39 @@
 	}
 }
 
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			       const void *val, int bytes)
+static int emulator_write_phys_mem(struct kvm_vcpu *vcpu, gpa_t gpa,
+				   struct page *page, const void *val,
+				   int bytes)
 {
-	struct page *page;
 	void *virt;
-	unsigned offset = offset_in_page(gpa);
+	unsigned int offset;
+
+	offset = offset_in_page(gpa);
 
 	if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
 		return 0;
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	if (!page)
-		return 0;
+
 	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
 	virt = kmap_atomic(page, KM_USER0);
 	kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
 	memcpy(virt + offset_in_page(gpa), val, bytes);
 	kunmap_atomic(virt, KM_USER0);
+
 	return 1;
 }
 
+static int emulator_write_phys_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+				    const void *val, int bytes)
+{
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_phys_addr = gpa;
+	vcpu->mmio_size = bytes;
+	vcpu->mmio_is_write = 1;
+	memcpy(vcpu->mmio_data, val, bytes);
+
+	return 0;
+}
+
 static int emulator_write_emulated(unsigned long addr,
 				   const void *val,
 				   unsigned int bytes,
@@ -1152,20 +1165,18 @@
 {
 	struct kvm_vcpu *vcpu = ctxt->vcpu;
 	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+	struct page *page;
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
-	if (emulator_write_phys(vcpu, gpa, val, bytes))
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (page && emulator_write_phys_mem(vcpu, gpa, page, val, bytes))
 		return X86EMUL_CONTINUE;
 
-	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
+	emulator_write_phys_mmio(vcpu, gpa, val, bytes);
 
 	return X86EMUL_CONTINUE;
 }
@@ -1177,12 +1188,37 @@
 				     struct x86_emulate_ctxt *ctxt)
 {
 	static int reported;
+	struct kvm_vcpu *vcpu;
+	gpa_t gpa;
+	struct page *page;
 
 	if (!reported) {
 		reported = 1;
 		printk(KERN_WARNING "kvm: emulating exchange as write\n");
 	}
-	return emulator_write_emulated(addr, new, bytes, ctxt);
+
+	vcpu = ctxt->vcpu;
+	gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+
+	if (gpa == UNMAPPED_GVA) {
+		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (page) {
+		/* Regular memory */
+		if (!memcmp(old, new, bytes))
+			return X86EMUL_CONTINUE;
+
+		if (emulator_write_phys_mem(vcpu, gpa, page, new, bytes))
+			return X86EMUL_CONTINUE;
+	}
+
+	/* MMIO */
+	emulator_write_phys_mmio(vcpu, gpa, new, bytes);
+
+	return X86EMUL_CONTINUE;
 }
 
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)


Both patches apply to kvm-28 and have been run-time tested with a 32-bit
guest on a 32-bit host, on a VMX processor.

If the patches look good I'll resubmit with a proper changelog and
Signed-off-by.

Luca
--
The trouble with computers is that they do what you tell them,
not what you want.
D. Cohen