Re: [PATCH] zsmalloc: fix a race with deferred_handles storing

From: Nhat Pham
Date: Wed Jan 11 2023 - 14:56:37 EST


On Tue, Jan 10, 2023 at 3:17 PM Nhat Pham <nphamcs@xxxxxxxxx> wrote:
>
> Currently, there is a race between zs_free() and zs_reclaim_page():
> zs_reclaim_page() finds a handle to an allocated object, but before the
> eviction happens, an independent zs_free() call to the same handle could
> come in and overwrite the object value stored at the handle with
> the last deferred handle. When zs_reclaim_page() finally gets to call
> the eviction handler, it will see an invalid object value (i.e the
> previous deferred handle instead of the original object value).
>
> This race happens quite infrequently. We have only been able to reproduce
> it with out-of-tree development code that triggers zsmalloc writeback much
> more frequently than usual.
>
> This patch fixes this race by storing the deferred handle in the object
> header instead. We differentiate the deferred handle from the other two
> cases (handle for an allocated object, and linkage for a free object) with a
> new tag. If zspage reclamation succeeds, we free these deferred handles by
> walking through the zspage objects. On the other hand, if zspage reclamation
> fails, we reconstruct the zspage freelist (with the deferred handle tag and
> allocated tag) before retrying reclamation.
>
I forgot to add the Fixes tag to this patch:

Fixes: 9997bc017549 ("zsmalloc: implement writeback mechanism for zsmalloc")
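
To make the ordering concrete, below is a minimal user-space sketch of the
pre-patch behaviour. It is an illustration only: record_obj() and the
variable names merely mirror zsmalloc, while the real code runs under
pool->lock and across CPUs with kmap_atomic() mappings.

#include <stdio.h>

static unsigned long handle_slot;	/* "handle": slot storing the object value */
static unsigned long deferred_head;	/* old zspage->deferred_handles list head  */

/* Simplified stand-in for record_obj(): store val at the handle. */
static void record_obj(unsigned long *handle, unsigned long val)
{
	*handle = val;			/* clobbers whatever the handle stored */
}

int main(void)
{
	unsigned long obj = 0xabcd0;	/* encoded <PFN, obj_idx> of the object */
	unsigned long *handle = &handle_slot;

	*handle = obj;			/* zs_malloc(): handle -> object value */

	/*
	 * zs_reclaim_page() has found this handle, but before eviction runs,
	 * zs_free() on the same handle chains it onto the deferred list by
	 * overwriting the stored object value (the old behaviour):
	 */
	record_obj(handle, deferred_head);
	deferred_head = (unsigned long)handle;

	/* The eviction path now reads a stale deferred handle, not the object. */
	printf("reclaim sees %#lx, expected %#lx\n", *handle, obj);
	return 0;
}

With the patch applied, zs_free() leaves the value stored at the handle alone
and instead tags the deferred handle into the object's header (obj_free()
with a non-NULL handle argument), so reclaim never reads a clobbered value.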
> Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx>
> ---
> mm/zsmalloc.c | 237 +++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 205 insertions(+), 32 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 9445bee6b014..b4b40ef98703 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -113,7 +113,23 @@
> * have room for two bits at least.
> */
> #define OBJ_ALLOCATED_TAG 1
> -#define OBJ_TAG_BITS 1
> +
> +#ifdef CONFIG_ZPOOL
> +/*
> + * The second least-significant bit in the object's header identifies if the
> + * value stored at the header is a deferred handle from the last reclaim
> + * attempt.
> + *
> + * As noted above, this is valid because we have room for two bits.
> + */
> +#define OBJ_DEFERRED_HANDLE_TAG 2
> +#define OBJ_TAG_BITS 2
> +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
> +#else
> +#define OBJ_TAG_BITS 1
> +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
> +#endif /* CONFIG_ZPOOL */
> +
> #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
> #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
>
> @@ -222,6 +238,12 @@ struct link_free {
> * Handle of allocated object.
> */
> unsigned long handle;
> +#ifdef CONFIG_ZPOOL
> + /*
> + * Deferred handle of a reclaimed object.
> + */
> + unsigned long deferred_handle;
> +#endif
> };
> };
>
> @@ -272,8 +294,6 @@ struct zspage {
> /* links the zspage to the lru list in the pool */
> struct list_head lru;
> bool under_reclaim;
> - /* list of unfreed handles whose objects have been reclaimed */
> - unsigned long *deferred_handles;
> #endif
>
> struct zs_pool *pool;
> @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
> return *(unsigned long *)handle;
> }
>
> -static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> +static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
> + int tag)
> {
> unsigned long handle;
> struct zspage *zspage = get_zspage(page);
> @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> } else
> handle = *(unsigned long *)obj;
>
> - if (!(handle & OBJ_ALLOCATED_TAG))
> + if (!(handle & tag))
> return false;
>
> - *phandle = handle & ~OBJ_ALLOCATED_TAG;
> + /* Clear all tags before returning the handle */
> + *phandle = handle & ~OBJ_TAG_MASK;
> return true;
> }
>
> +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> +{
> + return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
> +}
> +
> +#ifdef CONFIG_ZPOOL
> +static bool obj_stores_deferred_handle(struct page *page, void *obj,
> + unsigned long *phandle)
> +{
> + return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
> +}
> +#endif
> +
> static void reset_page(struct page *page)
> {
> __ClearPageMovable(page);
> @@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
> }
>
> #ifdef CONFIG_ZPOOL
> +static unsigned long find_deferred_handle_obj(struct size_class *class,
> + struct page *page, int *obj_idx);
> +
> /*
> * Free all the deferred handles whose objects are freed in zs_free.
> */
> -static void free_handles(struct zs_pool *pool, struct zspage *zspage)
> +static void free_handles(struct zs_pool *pool, struct size_class *class,
> + struct zspage *zspage)
> {
> - unsigned long handle = (unsigned long)zspage->deferred_handles;
> + int obj_idx = 0;
> + struct page *page = get_first_page(zspage);
> + unsigned long handle;
>
> - while (handle) {
> - unsigned long nxt_handle = handle_to_obj(handle);
> + while (1) {
> + handle = find_deferred_handle_obj(class, page, &obj_idx);
> + if (!handle) {
> + page = get_next_page(page);
> + if (!page)
> + break;
> + obj_idx = 0;
> + continue;
> + }
>
> cache_free_handle(pool, handle);
> - handle = nxt_handle;
> + obj_idx++;
> }
> }
> #else
> -static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
> +static inline void free_handles(struct zs_pool *pool, struct size_class *class,
> + struct zspage *zspage) {}
> #endif
>
> static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> VM_BUG_ON(fg != ZS_EMPTY);
>
> /* Free all deferred handles from zs_free */
> - free_handles(pool, zspage);
> + free_handles(pool, class, zspage);
>
> next = page = get_first_page(zspage);
> do {
> @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
> #ifdef CONFIG_ZPOOL
> INIT_LIST_HEAD(&zspage->lru);
> zspage->under_reclaim = false;
> - zspage->deferred_handles = NULL;
> #endif
>
> set_freeobj(zspage, 0);
> @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
> }
> EXPORT_SYMBOL_GPL(zs_malloc);
>
> -static void obj_free(int class_size, unsigned long obj)
> +static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
> {
> struct link_free *link;
> struct zspage *zspage;
> @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
> zspage = get_zspage(f_page);
>
> vaddr = kmap_atomic(f_page);
> -
> - /* Insert this object in containing zspage's freelist */
> link = (struct link_free *)(vaddr + f_offset);
> - if (likely(!ZsHugePage(zspage)))
> - link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
> - else
> - f_page->index = 0;
> +
> + if (handle) {
> +#ifdef CONFIG_ZPOOL
> + /* Stores the (deferred) handle in the object's header */
> + *handle |= OBJ_DEFERRED_HANDLE_TAG;
> + *handle &= ~OBJ_ALLOCATED_TAG;
> +
> + if (likely(!ZsHugePage(zspage)))
> + link->deferred_handle = *handle;
> + else
> + f_page->index = *handle;
> +#endif
> + } else {
> + /* Insert this object in containing zspage's freelist */
> + if (likely(!ZsHugePage(zspage)))
> + link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
> + else
> + f_page->index = 0;
> + set_freeobj(zspage, f_objidx);
> + }
> +
> kunmap_atomic(vaddr);
> - set_freeobj(zspage, f_objidx);
> mod_zspage_inuse(zspage, -1);
> }
>
> @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> zspage = get_zspage(f_page);
> class = zspage_class(pool, zspage);
>
> - obj_free(class->size, obj);
> class_stat_dec(class, OBJ_USED, 1);
>
> #ifdef CONFIG_ZPOOL
> @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> * Reclaim needs the handles during writeback. It'll free
> * them along with the zspage when it's done with them.
> *
> - * Record current deferred handle at the memory location
> - * whose address is given by handle.
> + * Record current deferred handle in the object's header.
> */
> - record_obj(handle, (unsigned long)zspage->deferred_handles);
> - zspage->deferred_handles = (unsigned long *)handle;
> + obj_free(class->size, obj, &handle);
> spin_unlock(&pool->lock);
> return;
> }
> #endif
> + obj_free(class->size, obj, NULL);
> +
> fullness = fix_fullness_group(class, zspage);
> if (fullness == ZS_EMPTY)
> free_zspage(pool, class, zspage);
> @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
> }
>
> /*
> - * Find alloced object in zspage from index object and
> + * Find object with a certain tag in zspage from index object and
> * return handle.
> */
> -static unsigned long find_alloced_obj(struct size_class *class,
> - struct page *page, int *obj_idx)
> +static unsigned long find_tagged_obj(struct size_class *class,
> + struct page *page, int *obj_idx, int tag)
> {
> unsigned int offset;
> int index = *obj_idx;
> @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
> offset += class->size * index;
>
> while (offset < PAGE_SIZE) {
> - if (obj_allocated(page, addr + offset, &handle))
> + if (obj_tagged(page, addr + offset, &handle, tag))
> break;
>
> offset += class->size;
> @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
> return handle;
> }
>
> +/*
> + * Find alloced object in zspage from index object and
> + * return handle.
> + */
> +static unsigned long find_alloced_obj(struct size_class *class,
> + struct page *page, int *obj_idx)
> +{
> + return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
> +}
> +
> +#ifdef CONFIG_ZPOOL
> +/*
> + * Find object storing a deferred handle in header in zspage from index object
> + * and return handle.
> + */
> +static unsigned long find_deferred_handle_obj(struct size_class *class,
> + struct page *page, int *obj_idx)
> +{
> + return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
> +}
> +#endif
> +
> struct zs_compact_control {
> /* Source spage for migration which could be a subpage of zspage */
> struct page *s_page;
> @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> zs_object_copy(class, free_obj, used_obj);
> obj_idx++;
> record_obj(handle, free_obj);
> - obj_free(class->size, used_obj);
> + obj_free(class->size, used_obj, NULL);
> }
>
> /* Remember last position in this iteration */
> @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
> EXPORT_SYMBOL_GPL(zs_destroy_pool);
>
> #ifdef CONFIG_ZPOOL
> +static void restore_freelist(struct zs_pool *pool, struct size_class *class,
> + struct zspage *zspage)
> +{
> + unsigned int obj_idx = 0;
> + unsigned long handle, off = 0; /* off is within-page offset */
> + struct page *page = get_first_page(zspage);
> + struct link_free *prev_free = NULL;
> + void *prev_page_vaddr = NULL;
> +
> + /* in case no free object found */
> + set_freeobj(zspage, (unsigned int)(-1UL));
> +
> + while (page) {
> + void *vaddr = kmap_atomic(page);
> + struct page *next_page;
> +
> + while (off < PAGE_SIZE) {
> + void *obj_addr = vaddr + off;
> +
> + /* skip allocated object */
> + if (obj_allocated(page, obj_addr, &handle)) {
> + obj_idx++;
> + off += class->size;
> + continue;
> + }
> +
> + /* free deferred handle from reclaim attempt */
> + if (obj_stores_deferred_handle(page, obj_addr, &handle))
> + cache_free_handle(pool, handle);
> +
> + if (prev_free)
> + prev_free->next = obj_idx << OBJ_TAG_BITS;
> + else /* first free object found */
> + set_freeobj(zspage, obj_idx);
> +
> + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
> + /* if last free object in a previous page, need to unmap */
> + if (prev_page_vaddr) {
> + kunmap_atomic(prev_page_vaddr);
> + prev_page_vaddr = NULL;
> + }
> +
> + obj_idx++;
> + off += class->size;
> + }
> +
> + /*
> + * Handle the last (full or partial) object on this page.
> + */
> + next_page = get_next_page(page);
> + if (next_page) {
> + if (!prev_free || prev_page_vaddr) {
> + /*
> + * There is no free object in this page, so we can safely
> + * unmap it.
> + */
> + kunmap_atomic(vaddr);
> + } else {
> + /* update prev_page_vaddr since prev_free is on this page */
> + prev_page_vaddr = vaddr;
> + }
> + } else { /* this is the last page */
> + if (prev_free) {
> + /*
> + * Reset OBJ_TAG_BITS bit to last link to tell
> + * whether it's allocated object or not.
> + */
> + prev_free->next = -1UL << OBJ_TAG_BITS;
> + }
> +
> + /* unmap previous page (if not done yet) */
> + if (prev_page_vaddr) {
> + kunmap_atomic(prev_page_vaddr);
> + prev_page_vaddr = NULL;
> + }
> +
> + kunmap_atomic(vaddr);
> + }
> +
> + page = next_page;
> + off %= PAGE_SIZE;
> + }
> +}
> +
> static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> {
> int i, obj_idx, ret = 0;
> @@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> return 0;
> }
>
> + /*
> + * Eviction fails on one of the handles, so we need to restore zspage.
> + * We need to rebuild its freelist (and free stored deferred handles),
> + * put it back to the correct size class, and add it to the LRU list.
> + */
> + restore_freelist(pool, class, zspage);
> putback_zspage(class, zspage);
> list_add(&zspage->lru, &pool->lru);
> unlock_zspage(zspage);
> --
> 2.30.2
>
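
For completeness, here is a small user-space model of the two-bit tagging the
patch introduces. The macro values come straight from the patch; the
encode_*() and header_tagged() helpers are hypothetical and only illustrate
how the three header states (free-list link, allocated handle, deferred
handle) stay distinguishable. They are not the kernel implementation, which
also handles huge-page zspages and kmap_atomic() mappings.

#include <stdbool.h>
#include <stdio.h>

#define OBJ_ALLOCATED_TAG	1UL
#define OBJ_DEFERRED_HANDLE_TAG	2UL
#define OBJ_TAG_BITS		2
#define OBJ_TAG_MASK		(OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)

/* Free object: header stores the next free index, shifted past the tag bits. */
static unsigned long encode_free_link(unsigned long next_idx)
{
	return next_idx << OBJ_TAG_BITS;
}

/* Allocated object: header stores the handle with OBJ_ALLOCATED_TAG set. */
static unsigned long encode_allocated(unsigned long handle)
{
	return handle | OBJ_ALLOCATED_TAG;
}

/* Reclaimed-but-unfreed object: deferred handle, allocated tag cleared. */
static unsigned long encode_deferred(unsigned long header)
{
	return (header & ~OBJ_ALLOCATED_TAG) | OBJ_DEFERRED_HANDLE_TAG;
}

/* Counterpart of obj_tagged(): test a tag and return the untagged payload. */
static bool header_tagged(unsigned long header, unsigned long tag,
			  unsigned long *payload)
{
	if (!(header & tag))
		return false;
	*payload = header & ~OBJ_TAG_MASK;
	return true;
}

int main(void)
{
	unsigned long handle = 0x1000;	/* handles are at least 4-byte aligned */
	unsigned long payload = 0;
	unsigned long header = encode_allocated(handle);

	/* zs_free() on an under-reclaim zspage re-tags the header in place. */
	header = encode_deferred(header);

	if (header_tagged(header, OBJ_DEFERRED_HANDLE_TAG, &payload))
		printf("deferred handle %#lx recovered from header %#lx\n",
		       payload, header);
	printf("free-link header for index 3: %#lx\n", encode_free_link(3));
	return 0;
}

The scheme relies on handles being at least 4-byte aligned, which is why the
header has room for two tag bits (per the comment above OBJ_ALLOCATED_TAG).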