Re: [RFC PATCH 3/4] KVM: emulate: avoid per-byte copying in instruction fetches

From: Bandan Das
Date: Wed May 07 2014 - 00:36:38 EST


Paolo Bonzini <pbonzini@xxxxxxxxxx> writes:

> We do not need a memory copying loop anymore in insn_fetch; we
> can use a byte-aligned pointer to access instruction fields directly
> from the fetch_cache. This eliminates 40-80 cycles (corresponding to
> a 5-7% improvement in performance) from each instruction.

Nice approach!
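
If I'm reading the series right, the win comes from replacing the per-byte
copy out of fetch_cache with a single typed load at the current cursor. A
stripped-down user-space sketch of the two patterns, just to convince myself
of the codegen difference (the buffer, the function names and the unaligned
typedef below are mine, not from the patch):

#include <stdint.h>

/*
 * Illustrative only: "cache" stands in for fetch_cache.data and "pos" for
 * _eip - fc->start.  The kernel builds with -fno-strict-aliasing, so the
 * type-punned read below matches what the new macro does.
 */
static uint8_t cache[15];

/* Old pattern: copy byte by byte into a local, then use the value. */
static uint16_t fetch_u16_bytewise(unsigned int pos)
{
        uint16_t val;
        uint8_t *dst = (uint8_t *)&val;
        unsigned int i;

        for (i = 0; i < sizeof(val); i++)
                dst[i] = cache[pos + i];
        return val;
}

/*
 * New pattern: one load through a type whose alignment is 1, so the
 * compiler may not assume the address is 2-byte aligned; on x86 this is
 * still a single mov.
 */
typedef uint16_t unaligned_u16 __attribute__((aligned(1)));

static uint16_t fetch_u16_direct(unsigned int pos)
{
        return *(const unaligned_u16 *)&cache[pos];
}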
>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
> arch/x86/kvm/emulate.c | 47 ++++++++++++++++++++++-------------------------
> 1 file changed, 22 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 886f9a88010f..245a2d0bfe68 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -706,7 +706,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
> * Prefetch the remaining bytes of the instruction without crossing page
> * boundary if they are not in fetch_cache yet.
> */
> -static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
> +static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
> {
> struct fetch_cache *fc = &ctxt->fetch;
> int rc;
> @@ -738,42 +738,39 @@ static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
> return X86EMUL_CONTINUE;
> }
>
> -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
> - void *__dest, unsigned size)
> +static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
> + unsigned size)
> {
> - int rc;
> - struct fetch_cache *fc = &ctxt->fetch;
> - u8 *dest = __dest;
> - u8 *src = &fc->data[ctxt->_eip - fc->start];
> -
> /* We have to be careful about overflow! */
> - if (unlikely(ctxt->_eip > fc->end - size)) {
> - rc = do_insn_fetch_bytes(ctxt, size);
> - if (rc != X86EMUL_CONTINUE)
> - goto done;
> - }
> -
> - while (size--) {
> - *dest++ = *src++;
> - ctxt->_eip++;
> - continue;
> - }
> - return X86EMUL_CONTINUE;
> + if (unlikely(ctxt->_eip > ctxt->fetch.end - size))
> + return __do_insn_fetch_bytes(ctxt, size);
> + else
> + return X86EMUL_CONTINUE;
> }
>
> /* Fetch next part of the instruction being emulated. */
> #define insn_fetch(_type, _ctxt) \
> -({ unsigned long _x; \
> - rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
> +({ _type _x; \
> + struct fetch_cache *_fc; \
> + \
> + rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
> if (rc != X86EMUL_CONTINUE) \
> goto done; \
> - (_type)_x; \
> + _fc = &ctxt->fetch; \
> + _x = *(_type __aligned(1) *) &_fc->data[ctxt->_eip - _fc->start]; \
For my own understanding, how does the __aligned help here? Wouldn't
that result in unaligned accesses that will actually impact performance?
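
To make sure I'm parsing the cast right: is the intent just to tell the
compiler not to assume natural alignment for that one load, i.e. roughly the
user-space equivalent below (the typedef and function names are mine)? My
understanding is that x86 still gets a single mov out of this and only
strict-alignment architectures would have to fall back to byte-wise loads,
but I may well be missing something.

#include <stdint.h>

/*
 * My reading (illustrative): the access type's alignment drops to 1, so
 * the compiler must not assume p is 4-byte aligned here.
 */
typedef uint32_t u32_unaligned __attribute__((aligned(1)));

static uint32_t read_u32_unaligned(const uint8_t *p)
{
        return *(const u32_unaligned *)p;
}

/* For contrast: this one lets the compiler assume natural alignment. */
static uint32_t read_u32_aligned(const uint8_t *p)
{
        return *(const uint32_t *)p;
}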

> + ctxt->_eip += sizeof(_type); \
> + _x; \
> })
>
> #define insn_fetch_arr(_arr, _size, _ctxt) \
> -({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
> +({ \
> + struct fetch_cache *_fc; \
> + rc = do_insn_fetch_bytes(_ctxt, _size); \
> if (rc != X86EMUL_CONTINUE) \
> goto done; \
> + _fc = &ctxt->fetch; \
> + memcpy(_arr, &_fc->data[ctxt->_eip - _fc->start], _size); \
> + ctxt->_eip += (_size); \
> })
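
Also, to double-check my reading of the new insn_fetch(): a call like
insn_fetch(u16, ctxt) now boils down to roughly the following once
hand-expanded (still relying on the caller's local rc and done: label, as
before):

        u16 _x;
        struct fetch_cache *_fc;

        /* refill fetch_cache (without crossing a page boundary) only if
         * the next sizeof(u16) bytes are not already cached */
        rc = do_insn_fetch_bytes(ctxt, sizeof(u16));
        if (rc != X86EMUL_CONTINUE)
                goto done;

        /* single typed read straight out of the cache, no copy loop */
        _fc = &ctxt->fetch;
        _x = *(u16 __aligned(1) *)&_fc->data[ctxt->_eip - _fc->start];
        ctxt->_eip += sizeof(u16);
        /* the final _x is the value of the ({ ... }) expression */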
>
> /*
> @@ -4282,7 +4279,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
> if (insn_len > 0)
> memcpy(ctxt->fetch.data, insn, insn_len);
> else {
> - rc = do_insn_fetch_bytes(ctxt, 1);
> + rc = __do_insn_fetch_bytes(ctxt, 1);
> if (rc != X86EMUL_CONTINUE)
> return rc;
> }