Re: [RFC PATCH] fork: Free per-cpu cached vmalloc'ed thread stacks with vfree_atomic()

From: Uladzislau Rezki
Date: Mon Sep 07 2020 - 12:32:04 EST


On Mon, Sep 07, 2020 at 10:45:38AM +0200, Christian Brauner wrote:
> On Sat, Sep 05, 2020 at 12:12:29AM +0000, Isaac J. Manjarres wrote:
> > The per-cpu cached vmalloc'ed stacks are currently freed in the
> > CPU hotplug teardown path by the free_vm_stack_cache() callback,
> > which invokes vfree(), which may result in purging the list of
> > lazily freed vmap areas.
> >
> > Purging all of the lazily freed vmap areas can take a long time
> > when the list of vmap areas is large. This is problematic, because
> > free_vm_stack_cache() is invoked before the offline CPU's timers
> > are migrated; the purge can therefore delay timer migration in the
> > CPU hotplug teardown path, and timer callbacks end up running long
> > after their timers have expired.
> >
> > For example, on a system that has only one online CPU (CPU 1) that is
> > running a heavy workload, and another CPU that is being offlined,
> > the online CPU will invoke free_vm_stack_cache() to free the cached
> > vmalloc'ed stacks for the CPU being offlined. When there are 2702
> > vmap areas that total to 13498 pages, free_vm_stack_cache() takes
> > over 2 seconds to execute:
> >
> > [001] 399.335808: cpuhp_enter: cpu: 0005 target: 0 step: 67 (free_vm_stack_cache)
> >
> > /* The first vmap area to be freed */
> > [001] 399.337157: __purge_vmap_area_lazy: [0:2702] 0xffffffc033da8000 - 0xffffffc033dad000 (5 : 13498)
> >
> > /* After two seconds */
> > [001] 401.528010: __purge_vmap_area_lazy: [1563:2702] 0xffffffc02fe10000 - 0xffffffc02fe15000 (5 : 5765)
> >
> > Instead of freeing the per-cpu cached vmalloc'ed stacks synchronously
> > with respect to the CPU hotplug teardown state machine, free them
> > asynchronously so that the teardown state machine can make progress
> > quickly.
> >
> > Signed-off-by: Isaac J. Manjarres <isaacm@xxxxxxxxxxxxxx>
> > ---
>
> This needs reviews and acks from the good folks over at mm/.
>
> > kernel/fork.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 4d32190..68346a0 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -202,7 +202,7 @@ static int free_vm_stack_cache(unsigned int cpu)
> > if (!vm_stack)
> > continue;
> >
> > - vfree(vm_stack->addr);
> > + vfree_atomic(vm_stack->addr);
> > cached_vm_stacks[i] = NULL;
> > }
> >
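For context on why the one-line change above helps: vfree() can end up
purging the whole list of lazily freed vmap areas in the caller's
context, whereas vfree_atomic() only queues the address and lets a
workqueue item do the real free later, off the hotplug path. Below is a
minimal user-space model of that deferral pattern; it is a sketch only,
not kernel code, and slow_purge(), deferred_free() and worker() are
made-up names:

<snip>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void *queue[64];
static int queued;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;

/* Stand-in for the lazy purge vfree() may trigger: slow when the list is long. */
static void slow_purge(void *addr)
{
	usleep(100 * 1000);	/* pretend this walks thousands of vmap areas */
	printf("purged %p\n", addr);
	free(addr);
}

/* vfree()-like path: the caller eats the whole purge cost. */
static void sync_free(void *addr)
{
	slow_purge(addr);
}

/* vfree_atomic()-like path: queue the pointer and wake the worker. */
static void deferred_free(void *addr)
{
	pthread_mutex_lock(&lock);
	queue[queued++] = addr;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
}

/* Worker thread: does the slow purge off the caller's critical path. */
static void *worker(void *unused)
{
	(void)unused;

	for (;;) {
		void *addr;

		pthread_mutex_lock(&lock);
		while (!queued)
			pthread_cond_wait(&kick, &lock);
		addr = queue[--queued];
		pthread_mutex_unlock(&lock);

		slow_purge(addr);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);

	/* "Hotplug callback": returns immediately, worker purges later. */
	deferred_free(malloc(16));

	/* Old behaviour, for contrast: blocks until the purge is done. */
	sync_free(malloc(16));

	sleep(1);		/* give the worker time to drain the queue */
	return 0;
}
<snip>

With the deferred variant the "hotplug callback" in main() returns right
away and the slow purge runs in the worker thread instead, which is what
vfree_atomic() buys free_vm_stack_cache().
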
Could you please also test the patch below?

<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index be4724b916b3..cef7a9768f01 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -419,6 +419,10 @@ static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

+static struct rb_root purge_vmap_area_root = RB_ROOT;
+static LIST_HEAD(purge_vmap_area_list);
+static DEFINE_SPINLOCK(purge_vmap_area_lock);
+
/*
* This kmem_cache is used for vmap_area objects. Instead of
* allocating from slab we reuse an object from this cache to
@@ -743,7 +747,8 @@ insert_vmap_area_augment(struct vmap_area *va,
*/
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
- struct rb_root *root, struct list_head *head)
+ struct rb_root *root, struct list_head *head,
+ bool update)
{
struct vmap_area *sibling;
struct list_head *next;
@@ -825,7 +830,9 @@ merge_or_add_vmap_area(struct vmap_area *va,
/*
* Last step is to check and update the tree.
*/
- augment_tree_propagate_from(va);
+ if (update)
+ augment_tree_propagate_from(va);
+
return va;
}

@@ -1140,7 +1147,7 @@ static void free_vmap_area(struct vmap_area *va)
* Insert/Merge it back to the free tree/list.
*/
spin_lock(&free_vmap_area_lock);
- merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+ merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list, true);
spin_unlock(&free_vmap_area_lock);
}

@@ -1328,32 +1335,33 @@ void set_iounmap_nonlazy(void)
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
- struct llist_node *valist;
- struct vmap_area *va;
- struct vmap_area *n_va;
+ struct list_head local_pure_list;
+ struct vmap_area *va, *n_va;

lockdep_assert_held(&vmap_purge_lock);

- valist = llist_del_all(&vmap_purge_list);
- if (unlikely(valist == NULL))
+ spin_lock(&purge_vmap_area_lock);
+ purge_vmap_area_root = RB_ROOT;
+ list_replace_init(&purge_vmap_area_list, &local_pure_list);
+ spin_unlock(&purge_vmap_area_lock);
+
+ if (unlikely(list_empty(&local_pure_list)))
return false;

- /*
- * TODO: to calculate a flush range without looping.
- * The list can be up to lazy_max_pages() elements.
- */
- llist_for_each_entry(va, valist, purge_list) {
- if (va->va_start < start)
- start = va->va_start;
- if (va->va_end > end)
- end = va->va_end;
- }
+
+ start = min(start,
+ list_first_entry(&local_pure_list,
+ struct vmap_area, list)->va_start);
+
+ end = max(end,
+ list_last_entry(&local_pure_list,
+ struct vmap_area, list)->va_end);

flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;

spin_lock(&free_vmap_area_lock);
- llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
unsigned long orig_start = va->va_start;
unsigned long orig_end = va->va_end;
@@ -1364,7 +1372,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* anything.
*/
va = merge_or_add_vmap_area(va, &free_vmap_area_root,
- &free_vmap_area_list);
+ &free_vmap_area_list, true);

if (!va)
continue;
@@ -1421,9 +1429,19 @@ static void free_vmap_area_noflush(struct vmap_area *va)
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);

- /* After this point, we may free va at any time */
- llist_add(&va->purge_list, &vmap_purge_list);
+ /*
+ * But this time the node has been removed from the busy
+ * data structures therefore, merge or place it to the purge
+ * tree/list.
+ */
+ spin_lock(&purge_vmap_area_lock);
+ merge_or_add_vmap_area(va,
+ &purge_vmap_area_root, &purge_vmap_area_list, false);
+ spin_unlock(&purge_vmap_area_lock);

+ /*
+ * After this point, we may free va at any time.
+ */
if (unlikely(nr_lazy > lazy_max_pages()))
try_purge_vmap_area_lazy();
}
@@ -3352,7 +3370,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
- &free_vmap_area_list);
+ &free_vmap_area_list, true);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -3402,7 +3420,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
- &free_vmap_area_list);
+ &free_vmap_area_list, true);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
<snip>
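
The point of the rework above is that lazily freed areas now go into
their own address-sorted, merged-on-insert tree/list
(purge_vmap_area_root/purge_vmap_area_list), so __purge_vmap_area_lazy()
can take the flush bounds from the first and last entries instead of
looping over every node. Below is a minimal user-space model of that
idea; it is a sketch only, a plain sorted singly linked list stands in
for the kernel's rb-tree plus list, and range, purge_add() and
purge_drain() are made-up names:

<snip>
#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned long start, end;	/* [start, end) */
	struct range *next;
};

static struct range *purge_head;	/* kept sorted by start address */

/* Insert a range, merging it with any neighbours it overlaps or touches. */
static void purge_add(unsigned long start, unsigned long end)
{
	struct range **link = &purge_head, *cur;

	/* Find the first node whose end reaches the new range. */
	while ((cur = *link) && cur->end < start)
		link = &cur->next;

	if (cur && cur->start <= end) {
		/* Overlapping or adjacent: grow the existing node. */
		if (start < cur->start)
			cur->start = start;
		if (end > cur->end)
			cur->end = end;

		/* Swallow any following nodes the grown node now covers. */
		while (cur->next && cur->next->start <= cur->end) {
			struct range *victim = cur->next;

			if (victim->end > cur->end)
				cur->end = victim->end;
			cur->next = victim->next;
			free(victim);
		}
		return;
	}

	/* No neighbour to merge with: insert a new node, keeping the order. */
	cur = malloc(sizeof(*cur));
	cur->start = start;
	cur->end = end;
	cur->next = *link;
	*link = cur;
}

/*
 * Drain everything. Because purge_add() keeps the list address-sorted
 * and merged, the flush range is just first->start .. last->end; the
 * kernel gets "last" in O(1) from its doubly linked list, the model
 * below simply walks to the tail.
 */
static void purge_drain(void)
{
	struct range *cur = purge_head, *last;

	if (!cur)
		return;

	for (last = cur; last->next; last = last->next)
		;
	printf("flush range: 0x%lx - 0x%lx\n", cur->start, last->end);

	while ((cur = purge_head)) {
		purge_head = cur->next;
		free(cur);
	}
}

int main(void)
{
	purge_add(0x3000, 0x4000);
	purge_add(0x1000, 0x2000);
	purge_add(0x2000, 0x3000);	/* bridges the two areas above */
	purge_drain();			/* prints: flush range: 0x1000 - 0x4000 */
	return 0;
}
<snip>

Running it prints "flush range: 0x1000 - 0x4000": the three inserts
collapse into a single node, and the drain step only needs the two ends
of the list to size the TLB flush.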

Thank you!

--
Vlad Rezki