Re: [PATCH v4 3/3] x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()

From: Jarkko Sakkinen
Date: Sun Mar 14 2021 - 07:58:44 EST


On Sat, Mar 13, 2021 at 06:01:19PM +0200, Jarkko Sakkinen wrote:
> Background
> ==========
>
> EPC section is covered by one or more SRAT entries that are associated with
> one and only one PXM (NUMA node). The motivation behind this patch is to
> provide basic elements of building allocation scheme based on this premise.
>
> Just like normal RAM, enclave memory (EPC) should be covered by entries
> in the ACPI SRAT table. These entries allow each EPC section to be
> associated with a NUMA node.
>
> Use this information to implement a simple NUMA-aware allocator for
> enclave memory.
>
> Solution
> ========
>
> Use phys_to_target_node() to associate each NUMA node with the EPC
> sections contained within its range. In sgx_alloc_epc_page(), first try
> to allocate from the NUMA node, where the CPU is executing. If that
> fails, allocate from other nodes, iterating them from the current node
> in order.
>
> Other
> =====
>
> NUMA_KEEP_MEMINFO dependency is required for phys_to_target_node().
>
> Link: https://lore.kernel.org/lkml/158188326978.894464.217282995221175417.stgit@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
> Signed-off-by: Jarkko Sakkinen <jarkko@xxxxxxxxxx>

We *can* have also epc_page->node by:

- Considering the first section as the EPC of that node.
- Printing a warning if more sections hit the same node, and
ignoring them.
- Merging sgx_numa_node and sgx_epc_section

I think this would be a decent idea. I think it's a sane assumption that
a node has a single EPC section, but it's good to have that warning just
in case.

I did not want to do this into this version because it's faster for me
to refactor into this assumption than revert back.

/Jarkko

> ---
>
> v4:
> * Cycle nodes instead of a global page list, starting from the node
> of the current thread.
> * Documented NUMA_KEEP_MEMINFO dependency to the commit message.
> * Added NUMA node pointer to struct sgx_epc_section. EPC page should
> reference to a section, since potentially a node could have multiple
> sections (Intel SDM does not say anything explicit about this).
> This the safest play.
> * Remove nodes_clear(sgx_numa_node_mask).
> * Appended Dave's additions to the commit message for the background
> section.
>
> arch/x86/Kconfig | 1 +
> arch/x86/kernel/cpu/sgx/main.c | 117 ++++++++++++++++++++-------------
> arch/x86/kernel/cpu/sgx/sgx.h | 16 +++--
> 3 files changed, 84 insertions(+), 50 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 513895af8ee7..3e6152a8dd2b 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1930,6 +1930,7 @@ config X86_SGX
> depends on CRYPTO_SHA256=y
> select SRCU
> select MMU_NOTIFIER
> + select NUMA_KEEP_MEMINFO if NUMA
> help
> Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
> that can be used by applications to set aside private regions of code
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index cb4561444b96..3b524a1361d6 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -18,14 +18,23 @@ static int sgx_nr_epc_sections;
> static struct task_struct *ksgxd_tsk;
> static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
>
> -/*
> - * These variables are part of the state of the reclaimer, and must be accessed
> - * with sgx_reclaimer_lock acquired.
> - */
> +/* The reclaimer lock protected variables prepend the lock. */
> static LIST_HEAD(sgx_active_page_list);
> -
> static DEFINE_SPINLOCK(sgx_reclaimer_lock);
>
> +/* The free page list lock protected variables prepend the lock. */
> +static unsigned long sgx_nr_free_pages;
> +
> +/* Nodes with one or more EPC sections. */
> +static nodemask_t sgx_numa_mask;
> +
> +/*
> + * Array with one list_head for each possible NUMA node. Each
> + * list contains all the sgx_epc_section's which are on that
> + * node.
> + */
> +static struct sgx_numa_node *sgx_numa_nodes;
> +
> /*
> * When the driver initialized, EPC pages go first here, as they could be
> * initialized to an active enclave, on kexec entry.
> @@ -352,21 +361,9 @@ static void sgx_reclaim_pages(void)
> }
> }
>
> -static unsigned long sgx_nr_free_pages(void)
> -{
> - unsigned long cnt = 0;
> - int i;
> -
> - for (i = 0; i < sgx_nr_epc_sections; i++)
> - cnt += sgx_epc_sections[i].free_cnt;
> -
> - return cnt;
> -}
> -
> static bool sgx_should_reclaim(unsigned long watermark)
> {
> - return sgx_nr_free_pages() < watermark &&
> - !list_empty(&sgx_active_page_list);
> + return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
> }
>
> static int ksgxd(void *p)
> @@ -443,50 +440,63 @@ static bool __init sgx_page_reclaimer_init(void)
> return true;
> }
>
> -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
> +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
> {
> - struct sgx_epc_page *page;
> + struct sgx_numa_node *node = &sgx_numa_nodes[nid];
> + struct sgx_epc_page *page = NULL;
> +
> + if (!node_isset(nid, sgx_numa_mask))
> + return NULL;
>
> - spin_lock(&section->lock);
> + spin_lock(&node->lock);
>
> - if (list_empty(&section->page_list)) {
> - spin_unlock(&section->lock);
> + if (list_empty(&node->free_page_list)) {
> + spin_unlock(&node->lock);
> return NULL;
> }
>
> - page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
> + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
> list_del_init(&page->list);
> - section->free_cnt--;
> + sgx_nr_free_pages--;
> +
> + spin_unlock(&node->lock);
>
> - spin_unlock(&section->lock);
> return page;
> }
>
> /**
> * __sgx_alloc_epc_page() - Allocate an EPC page
> *
> - * Iterate through EPC sections and borrow a free EPC page to the caller. When a
> - * page is no longer needed it must be released with sgx_free_epc_page().
> + * Iterate through NUMA nodes and borrow a free EPC page to the caller. When a
> + * page is no longer needed it must be released with sgx_free_epc_page(). Start
> + * from the NUMA node, where the caller is executing.
> *
> * Return:
> - * an EPC page,
> - * -errno on error
> + * - an EPC page: Free EPC pages were available.
> + * - ERR_PTR(-ENOMEM): Run out of EPC pages.
> */
> struct sgx_epc_page *__sgx_alloc_epc_page(void)
> {
> - struct sgx_epc_section *section;
> struct sgx_epc_page *page;
> - int i;
> + int nid = numa_node_id();
>
> - for (i = 0; i < sgx_nr_epc_sections; i++) {
> - section = &sgx_epc_sections[i];
> + /* Try to allocate EPC from the current node, first: */
> + page = __sgx_alloc_epc_page_from_node(nid);
> + if (page)
> + return page;
>
> - page = __sgx_alloc_epc_page_from_section(section);
> + /* Then, go through the other nodes: */
> + while (true) {
> + nid = next_node_in(nid, sgx_numa_mask);
> + if (nid == numa_node_id())
> + break;
> +
> + page = __sgx_alloc_epc_page_from_node(nid);
> if (page)
> - return page;
> + break;
> }
>
> - return ERR_PTR(-ENOMEM);
> + return page;
> }
>
> /**
> @@ -592,6 +602,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
> void sgx_free_epc_page(struct sgx_epc_page *page)
> {
> struct sgx_epc_section *section = &sgx_epc_sections[page->section];
> + struct sgx_numa_node *node = section->node;
> int ret;
>
> WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
> @@ -600,10 +611,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
> if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
> return;
>
> - spin_lock(&section->lock);
> - list_add_tail(&page->list, &section->page_list);
> - section->free_cnt++;
> - spin_unlock(&section->lock);
> + spin_lock(&node->lock);
> +
> + list_add_tail(&page->list, &node->free_page_list);
> + sgx_nr_free_pages++;
> +
> + spin_unlock(&node->lock);
> }
>
> static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
> @@ -624,8 +637,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
> }
>
> section->phys_addr = phys_addr;
> - spin_lock_init(&section->lock);
> - INIT_LIST_HEAD(&section->page_list);
>
> for (i = 0; i < nr_pages; i++) {
> section->pages[i].section = index;
> @@ -634,7 +645,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
> list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
> }
>
> - section->free_cnt = nr_pages;
> + sgx_nr_free_pages += nr_pages;
> return true;
> }
>
> @@ -653,8 +664,11 @@ static bool __init sgx_page_cache_init(void)
> {
> u32 eax, ebx, ecx, edx, type;
> u64 pa, size;
> + int nid;
> int i;
>
> + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
> +
> for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
> cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
>
> @@ -677,6 +691,21 @@ static bool __init sgx_page_cache_init(void)
> break;
> }
>
> + nid = numa_map_to_online_node(phys_to_target_node(pa));
> + if (nid == NUMA_NO_NODE) {
> + /* The physical address is already printed above. */
> + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
> + nid = 0;
> + }
> +
> + if (!node_isset(nid, sgx_numa_mask)) {
> + spin_lock_init(&sgx_numa_nodes[nid].lock);
> + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
> + node_set(nid, sgx_numa_mask);
> + }
> +
> + sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
> +
> sgx_nr_epc_sections++;
> }
>
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index bc8af0428640..653af8ca1a25 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -29,22 +29,26 @@ struct sgx_epc_page {
> struct list_head list;
> };
>
> +/*
> + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
> + * the free page list local to the node is stored here.
> + */
> +struct sgx_numa_node {
> + struct list_head free_page_list;
> + spinlock_t lock;
> +};
> +
> /*
> * The firmware can define multiple chunks of EPC to the different areas of the
> * physical memory e.g. for memory areas of the each node. This structure is
> * used to store EPC pages for one EPC section and virtual memory area where
> * the pages have been mapped.
> - *
> - * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
> */
> struct sgx_epc_section {
> unsigned long phys_addr;
> void *virt_addr;
> struct sgx_epc_page *pages;
> -
> - spinlock_t lock;
> - struct list_head page_list;
> - unsigned long free_cnt;
> + struct sgx_numa_node *node;
> };
>
> extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> --
> 2.30.2
>