Re: [PATCH v19 007/130] x86/virt/tdx: Export SEAMCALL functions

From: Sean Christopherson
Date: Thu Apr 18 2024 - 14:26:35 EST


On Thu, Apr 18, 2024, kirill.shutemov@xxxxxxxxxxxxxxx wrote:
> On Tue, Apr 16, 2024 at 07:45:18PM +0000, Edgecombe, Rick P wrote:
> > On Wed, 2024-04-10 at 15:49 +0300, Kirill A. Shutemov wrote:
> > > On Fri, Mar 15, 2024 at 09:33:20AM -0700, Sean Christopherson wrote:
> > > > So my feedback is to not worry about the exports, and instead focus on
> > > > figuring
> > > > out a way to make the generated code less bloated and easier to read/debug.
> > >
> > > I think it was mistake trying to centralize TDCALL/SEAMCALL calls into
> > > few megawrappers. I think we can get better results by shifting leaf
> > > function wrappers into assembly.
> > >
> > > We are going to have more assembly, but it should produce better result.
> > > Adding macros can help to write such wrapper and minimizer boilerplate.
> > >
> > > Below is an example of how it can look like. It's not complete. I only
> > > converted TDCALLs, but TDVMCALLs or SEAMCALLs. TDVMCALLs are going to be
> > > more complex.
> > >
> > > Any opinions? Is it something worth investing more time?
> >
> > We discussed offline how implementing these for each TDVM/SEAMCALL increases the
> > chances of a bug in just one TDVM/SEAMCALL. Which could making debugging
> > problems more challenging. Kirill raised the possibility of some code generating
> > solution like cpufeatures.h, that could take a spec and generate correct calls.
> >
> > So far no big wins have presented themselves. Kirill, do we think the path to
> > move the messy part out-of-line will not work?
>
> I converted all TDCALL and TDVMCALL leafs to direct assembly wrappers.
> Here's WIP branch: https://github.com/intel/tdx/commits/guest-tdx-asm/
>
> I still need to clean it up and write commit messages and comments for all
> wrappers.
>
> Now I think it worth the shot.
>
> Any feedback?

I find it hard to review for correctness, and extremely susceptible to developer
error. E.g. lots of copy+paste, and manual encoding of RCX to expose registers.
It also bleeds TDX ABI into C code, e.g.

/*
* As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
* EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
* So copy the register contents back to pt_regs.
*/
regs->ax = args.r12;
regs->bx = args.r13;
regs->cx = args.r14;
regs->dx = args.r15;

Oh, and it requires input/output paramters, which is quite gross for C code *and*
for assembly code, e.g.

u64 tdvmcall_map_gpa(u64 *gpa, u64 size);

and then the accompanying assembly code:

FRAME_BEGIN

save_regs r12,r13

movq (%rdi), %r12
movq %rsi, %r13

movq $(TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13), %rcx

tdvmcall $TDVMCALL_MAP_GPA

movq %r11, (%rdi)

restore_regs r13,r12

FRAME_END
RET

I think having one trampoline makes sense, e.g. to minimize the probability of
leaking register state to the VMM. The part that I don't like, and which generates
awful code, is shoving register state into a memory structure.

The annoying part with the TDX ABI is that it heavily uses r8-r15, and asm()
constraints don't play nice with r8-15. But that doesn't mean we can't use asm()
with macros, it just means we have to play games with registers.

Because REG-REG moves are super cheap, and ignoring the fatal error goofiness,
there are at most four inputs. That means having a single trampoline take *all*
possible inputs is a non-issue. And we can avoiding polluting the inline code if
we bury the register shuffling in the trampoline.

And if we use asm() wrappers to call the trampoline, then the trampoline doesn't
need to precisely follow the C calling convention. I.e. the trampoline can return
with the outputs still in r12-r15, and let the asm() wrappers extract the outputs
they want.

As it stands, TDVMCALLs either have 0, 1, or 4 outputs. I.e. we only need three
asm() wrappers. We could get away with one wrapper, but then users of the wrappers
would need dummy variables for inputs *and* outputs, and the outputs get gross.

Completely untested, but this is what I'm thinking. Side topic, I think making
"tdcall" a macro that takes a leaf is a mistake. If/when an assembler learns what
tdcall is, we're going to have to rewrite all of that code. And what a coincidence,
my suggestion needs a bare TDCALL! :-)

Side topic #2, I don't think the trampoline needs a stack frame, its a leaf function.

Side topic #3, the ud2 to induce panic should be out-of-line.

Weird? Yeah. But at least we one need to document one weird calling convention,
and the ugliness is contained to three macros and a small assembly function.

pushsection .noinstr.text, "ax"
SYM_FUNC_START(tdvmcall_trampoline)
movq $TDX_HYPERCALL_STANDARD, %r10
movq %rax, %r11
movq %rdi, %r12
movq %rsi, %r13
movq %rdx, %r14
movq %rcx, %r15

movq $(TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15), %rcx

tdcall

testq %rax, %rax
jnz .Lpanic

ret

Lpanic:
ud2
SYM_FUNC_END(tdvmcall_trampoline)
popsection


#define TDVMCALL(reason, in1, in2, in3, in4) \
({ \
long __ret; \
\
asm( \
"call tdvmcall_trampoline\n\t" \
"mov %%r10, %0\n\t" \
: "=r" (__ret) \
: "D" (in1), "S"(in2), "d"(in3), "c" (in4) \
: "r12", "r13", "r14", "r15" \
); \
__ret; \
})

#define TDVMCALL_1(reason, in1, in2, in3, in4, out1) \
({ \
long __ret; \
\
asm( \
"call tdvmcall_trampoline\n\t" \
"mov %%r10, %0\n\t" \
"mov %%r12, %1\n\t" \
: "=r"(__ret) "=r" (out1) \
: "a"(reason), "D" (in1), "S"(in2), "d"(in3), "c" (in4) \
: "r12", "r13", "r14", "r15" \
); \
__ret; \
})

#define TDVMCALL_4(reason, in1, in2, in3, in4, out1, out2, out3, out4) \
({ \
long __ret; \
\
asm( \
"call tdvmcall_trampoline\n\t" \
"mov %%r10, %0\n\t" \
"mov %%r12, %1\n\t" \
"mov %%r13, %2\n\t" \
"mov %%r14, %3\n\t" \
"mov %%r15, %4\n\t" \
: "=r" (__ret), \
"=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4) \
: "a"(reason), "D" (in1), "S"(in2), "d"(in3), "c" (in4) \
[reason] "i" (reason) \
: "r12", "r13", "r14", "r15" \
); \
__ret; \
})

static int handle_halt(struct ve_info *ve)
{
if (TDVMCALL(EXIT_REASON_HALT, irqs_disabled(), 0, 0, 0))
return -EIO;

return ve_instr_len(ve);
}

void __cpuidle tdx_safe_halt(void)
{
WARN_ONCE(TDVMCALL(EXIT_REASON_HALT, false, 0, 0, 0),
"HLT instruction emulation failed");
}

static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
u64 val;

if (TDVMCALL_1(EXIT_REASON_MSR_READ, regs->cx, 0, 0, 0, val))
return -EIO;

regs->ax = lower_32_bits(val);
regs->dx = upper_32_bits(val);

return ve_instr_len(ve);
}

static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
u64 val = (u64)regs->dx << 32 | regs->ax;

if (TDVMCALL(EXIT_REASON_MSR_WRITE, regs->cx, val, 0, 0))
return -EIO;

return ve_instr_len(ve);
}
static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
/*
* Only allow VMM to control range reserved for hypervisor
* communication.
*
* Return all-zeros for any CPUID outside the range. It matches CPU
* behaviour for non-supported leaf.
*/
if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
regs->ax = regs->bx = regs->cx = regs->dx = 0;
return ve_instr_len(ve);
}

if (TDVMCALL_4(EXIT_REASON_CPUID, regs->ax, regs->cx, 0, 0,
regs->ax, regs->bx, regs->cx, regs->dx))
return -EIO;

return ve_instr_len(ve);
}

static bool mmio_read(int size, u64 gpa, u64 *val)
{
*val = 0;
return !TDVMCALL_1(EXIT_REASON_EPT_VIOLATION, size, EPT_READ, gpa, 0, val);
}

static bool mmio_write(int size, u64 gpa, u64 val)
{
return !TDVMCALL(EXIT_REASON_EPT_VIOLATION, size, EPT_WRITE, gpa, val);
}