RE: [PATCH v5 09/16] x86/hyperv: provide a bunch of helper functions

From: Michael Kelley
Date: Tue Jan 26 2021 - 05:31:10 EST


From: Wei Liu <wei.liu@xxxxxxxxxx> Sent: Wednesday, January 20, 2021 4:01 AM
>
> They are used to deposit pages into Microsoft Hypervisor and bring up
> logical and virtual processors.
>
> Signed-off-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx>
> Signed-off-by: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx>
> Signed-off-by: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx>
> Co-Developed-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx>
> Co-Developed-by: Sunil Muthuswamy <sunilmut@xxxxxxxxxxxxx>
> Co-Developed-by: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Wei Liu <wei.liu@xxxxxxxxxx>
> ---
> v4: Fix compilation issue when CONFIG_ACPI_NUMA is not set.
>
> v3:
> 1. Add __packed to structures.
> 2. Drop unnecessary exports.
>
> v2:
> 1. Adapt to hypervisor side changes
> 2. Address Vitaly's comments
> ---
> arch/x86/hyperv/Makefile | 2 +-
> arch/x86/hyperv/hv_proc.c | 225 ++++++++++++++++++++++++++++++
> arch/x86/include/asm/mshyperv.h | 4 +
> include/asm-generic/hyperv-tlfs.h | 67 +++++++++
> 4 files changed, 297 insertions(+), 1 deletion(-)
> create mode 100644 arch/x86/hyperv/hv_proc.c
>
> diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
> index 89b1f74d3225..565358020921 100644
> --- a/arch/x86/hyperv/Makefile
> +++ b/arch/x86/hyperv/Makefile
> @@ -1,6 +1,6 @@
> # SPDX-License-Identifier: GPL-2.0-only
> obj-y := hv_init.o mmu.o nested.o
> -obj-$(CONFIG_X86_64) += hv_apic.o
> +obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
>
> ifdef CONFIG_X86_64
> obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
> diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
> new file mode 100644
> index 000000000000..706097160e2f
> --- /dev/null
> +++ b/arch/x86/hyperv/hv_proc.c
> @@ -0,0 +1,225 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/types.h>
> +#include <linux/version.h>
> +#include <linux/vmalloc.h>
> +#include <linux/mm.h>
> +#include <linux/clockchips.h>
> +#include <linux/acpi.h>
> +#include <linux/hyperv.h>
> +#include <linux/slab.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/minmax.h>
> +#include <asm/hypervisor.h>
> +#include <asm/mshyperv.h>
> +#include <asm/apic.h>
> +
> +#include <asm/trace/hyperv.h>
> +
> +#define HV_DEPOSIT_MAX_ORDER (8)
> +#define HV_DEPOSIT_MAX (1 << HV_DEPOSIT_MAX_ORDER)

Is there any reason to not let the maximum be 511, which is
how many entries will fit on the hypercall input page? The
max could be define in terms of HY_HYP_PAGE_SIZE so that
the logical dependency is fully expressed.

> +
> +/*
> + * Deposits exact number of pages
> + * Must be called with interrupts enabled
> + * Max 256 pages
> + */
> +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> +{
> + struct page **pages;
> + int *counts;
> + int num_allocations;
> + int i, j, page_count;
> + int order;
> + int desired_order;
> + u16 status;
> + int ret;
> + u64 base_pfn;
> + struct hv_deposit_memory *input_page;
> + unsigned long flags;
> +
> + if (num_pages > HV_DEPOSIT_MAX)
> + return -E2BIG;
> + if (!num_pages)
> + return 0;
> +
> + /* One buffer for page pointers and counts */
> + pages = page_address(alloc_page(GFP_KERNEL));
> + if (!pages)

Does the above check work? If alloc_pages() returns NULL, it looks like
page_address() might fault.

> + return -ENOMEM;
> +
> + counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
> + if (!counts) {
> + free_page((unsigned long)pages);
> + return -ENOMEM;
> + }
> +
> + /* Allocate all the pages before disabling interrupts */
> + num_allocations = 0;
> + i = 0;
> + order = HV_DEPOSIT_MAX_ORDER;
> +
> + while (num_pages) {
> + /* Find highest order we can actually allocate */
> + desired_order = 31 - __builtin_clz(num_pages);
> + order = min(desired_order, order);

The above seems redundant since request sizes larger than the
max have already been rejected.

> + do {
> + pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
> + if (!pages[i]) {
> + if (!order) {
> + ret = -ENOMEM;
> + goto err_free_allocations;
> + }
> + --order;
> + }
> + } while (!pages[i]);

The duplicative test of !pages[i] is somewhat annoying. How about
this:

while{!pages[i] = alloc_pages_node(node, GFP_KERNEL, order) {
if (!order) {
ret = -ENOMEM;
goto err_free_allocations;
}
--order;
}

or if you don't like doing an assignment in the while test:

while(1) {
pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
if (page[i])
break;
if (!order) {
ret = -ENOMEM;
goto err_free_allocations;
}
--order;
}

> +
> + split_page(pages[i], order);
> + counts[i] = 1 << order;
> + num_pages -= counts[i];
> + i++;
> + num_allocations++;

Incrementing both I and num_allocations in the loop seems
redundant, especially since num_allocations isn't used in the loop.
Could num_allocations be assigned the value of i once the loop
is exited? (and num_allocations would not need to be initialized to 0.)
Would also have to do the assignment in the error case.

> + }
> +
> + local_irq_save(flags);
> +
> + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +
> + input_page->partition_id = partition_id;
> +
> + /* Populate gpa_page_list - these will fit on the input page */
> + for (i = 0, page_count = 0; i < num_allocations; ++i) {
> + base_pfn = page_to_pfn(pages[i]);
> + for (j = 0; j < counts[i]; ++j, ++page_count)
> + input_page->gpa_page_list[page_count] = base_pfn + j;
> + }
> + status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
> + page_count, 0, input_page,
> + NULL) & HV_HYPERCALL_RESULT_MASK;

Similar comment about how hypercall status is checked.

> + local_irq_restore(flags);
> +
> + if (status != HV_STATUS_SUCCESS) {
> + pr_err("Failed to deposit pages: %d\n", status);
> + ret = status;
> + goto err_free_allocations;
> + }
> +
> + ret = 0;
> + goto free_buf;
> +
> +err_free_allocations:
> + for (i = 0; i < num_allocations; ++i) {
> + base_pfn = page_to_pfn(pages[i]);
> + for (j = 0; j < counts[i]; ++j)
> + __free_page(pfn_to_page(base_pfn + j));
> + }
> +
> +free_buf:
> + free_page((unsigned long)pages);
> + kfree(counts);
> + return ret;
> +}
> +
> +int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
> +{
> + struct hv_add_logical_processor_in *input;
> + struct hv_add_logical_processor_out *output;
> + int status;
> + unsigned long flags;
> + int ret = 0;
> +#ifdef CONFIG_ACPI_NUMA
> + int pxm = node_to_pxm(node);
> +#else
> + int pxm = 0;
> +#endif

It seems like the above #ifdef'ery might be better fixed in
include/acpi/acpi_numa.h, where there's already a null definition
of pxm_to_node() in case CONFIG_ACPI_NUMA isn't defined. There
should also be a null definition of node_to_pxm() in that file.

> +
> + /*
> + * When adding a logical processor, the hypervisor may return
> + * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
> + * pages and retry.
> + */
> + do {
> + local_irq_save(flags);
> +
> + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> + /* We don't do anything with the output right now */
> + output = *this_cpu_ptr(hyperv_pcpu_output_arg);
> +
> + input->lp_index = lp_index;
> + input->apic_id = apic_id;
> + input->flags = 0;
> + input->proximity_domain_info.domain_id = pxm;
> + input->proximity_domain_info.flags.reserved = 0;
> + input->proximity_domain_info.flags.proximity_info_valid = 1;
> + input->proximity_domain_info.flags.proximity_preferred = 1;
> + status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
> + input, output);
> + local_irq_restore(flags);
> +
> + if (status != HV_STATUS_INSUFFICIENT_MEMORY) {

The 'and' with HV_HYPERCALL_RESULT_MASK isn't coded anywhere for this
hypercall, and 'status' is declared as 'int'.

> + if (status != HV_STATUS_SUCCESS) {
> + pr_err("%s: cpu %u apic ID %u, %d\n", __func__,
> + lp_index, apic_id, status);
> + ret = status;
> + }
> + break;
> + }
> + ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
> + } while (!ret);
> +
> + return ret;
> +}
> +
> +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
> +{
> + struct hv_create_vp *input;
> + u16 status;
> + unsigned long irq_flags;
> + int ret = 0;
> +#ifdef CONFIG_ACPI_NUMA
> + int pxm = node_to_pxm(node);
> +#else
> + int pxm = 0;
> +#endif

Same comment.

> +
> + /* Root VPs don't seem to need pages deposited */
> + if (partition_id != hv_current_partition_id) {
> + ret = hv_call_deposit_pages(node, partition_id, 90);

Perhaps add a comment about the value "90". Was it
empirically determined?

> + if (ret)
> + return ret;
> + }
> +
> + do {
> + local_irq_save(irq_flags);
> +
> + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +
> + input->partition_id = partition_id;
> + input->vp_index = vp_index;
> + input->flags = flags;
> + input->subnode_type = HvSubnodeAny;
> + if (node != NUMA_NO_NODE) {
> + input->proximity_domain_info.domain_id = pxm;
> + input->proximity_domain_info.flags.reserved = 0;
> + input->proximity_domain_info.flags.proximity_info_valid = 1;
> + input->proximity_domain_info.flags.proximity_preferred = 1;
> + } else {
> + input->proximity_domain_info.as_uint64 = 0;
> + }
> + status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
> + local_irq_restore(irq_flags);
> +
> + if (status != HV_STATUS_INSUFFICIENT_MEMORY) {

Same problems with the status check.

> + if (status != HV_STATUS_SUCCESS) {
> + pr_err("%s: vcpu %u, lp %u, %d\n", __func__,
> + vp_index, flags, status);
> + ret = status;
> + }
> + break;
> + }
> + ret = hv_call_deposit_pages(node, partition_id, 1);
> +
> + } while (!ret);
> +
> + return ret;
> +}
> +
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index 67f5d35a73d3..4e590a167160 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -80,6 +80,10 @@ extern void __percpu **hyperv_pcpu_output_arg;
>
> extern u64 hv_current_partition_id;
>
> +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
> +int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
> +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
> +
> static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
> {
> u64 input_address = input ? virt_to_phys(input) : 0;
> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
> index 87b1a79b19eb..ec53570102f0 100644
> --- a/include/asm-generic/hyperv-tlfs.h
> +++ b/include/asm-generic/hyperv-tlfs.h
> @@ -142,6 +142,8 @@ struct ms_hyperv_tsc_page {
> #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
> #define HVCALL_SEND_IPI_EX 0x0015
> #define HVCALL_GET_PARTITION_ID 0x0046
> +#define HVCALL_DEPOSIT_MEMORY 0x0048
> +#define HVCALL_CREATE_VP 0x004e
> #define HVCALL_GET_VP_REGISTERS 0x0050
> #define HVCALL_SET_VP_REGISTERS 0x0051
> #define HVCALL_POST_MESSAGE 0x005c
> @@ -149,6 +151,7 @@ struct ms_hyperv_tsc_page {
> #define HVCALL_POST_DEBUG_DATA 0x0069
> #define HVCALL_RETRIEVE_DEBUG_DATA 0x006a
> #define HVCALL_RESET_DEBUG_SESSION 0x006b
> +#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076
> #define HVCALL_RETARGET_INTERRUPT 0x007e
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
> @@ -413,6 +416,70 @@ struct hv_get_partition_id {
> u64 partition_id;
> } __packed;
>
> +/* HvDepositMemory hypercall */
> +struct hv_deposit_memory {
> + u64 partition_id;
> + u64 gpa_page_list[];
> +} __packed;
> +
> +struct hv_proximity_domain_flags {
> + u32 proximity_preferred : 1;
> + u32 reserved : 30;
> + u32 proximity_info_valid : 1;
> +} __packed;
> +
> +/* Not a union in windows but useful for zeroing */
> +union hv_proximity_domain_info {
> + struct {
> + u32 domain_id;
> + struct hv_proximity_domain_flags flags;
> + };
> + u64 as_uint64;
> +} __packed;
> +
> +struct hv_lp_startup_status {
> + u64 hv_status;
> + u64 substatus1;
> + u64 substatus2;
> + u64 substatus3;
> + u64 substatus4;
> + u64 substatus5;
> + u64 substatus6;
> +} __packed;
> +
> +/* HvAddLogicalProcessor hypercall */
> +struct hv_add_logical_processor_in {
> + u32 lp_index;
> + u32 apic_id;
> + union hv_proximity_domain_info proximity_domain_info;
> + u64 flags;
> +};

__packed is missing from this struct definition

> +
> +struct hv_add_logical_processor_out {
> + struct hv_lp_startup_status startup_status;
> +} __packed;
> +
> +enum HV_SUBNODE_TYPE
> +{
> + HvSubnodeAny = 0,
> + HvSubnodeSocket,
> + HvSubnodeAmdNode,
> + HvSubnodeL3,
> + HvSubnodeCount,
> + HvSubnodeInvalid = -1
> +};

Are these values defined by Hyper-V? If so, explicitly coding the
value of each enum member might be better.

> +
> +/* HvCreateVp hypercall */
> +struct hv_create_vp {
> + u64 partition_id;
> + u32 vp_index;
> + u8 padding[3];
> + u8 subnode_type;
> + u64 subnode_id;
> + union hv_proximity_domain_info proximity_domain_info;
> + u64 flags;
> +} __packed;
> +
> /* HvRetargetDeviceInterrupt hypercall */
> union hv_msi_entry {
> u64 as_uint64;
> --
> 2.20.1