[RFC][PATCH v3 1/5] mm/zsmalloc: introduce class auto-compaction

From: Sergey Senozhatsky
Date: Thu Mar 03 2016 - 09:48:27 EST


zsmalloc classes are known to be affected by internal fragmentation.

For example, /sys/kernel/debug/zsmalloc/zramX/classes
class  size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
   54   896           1           12           117       57         26                2       12
  ...
  107  1744           1           23           196       76         84                3       51
  111  1808           0            0            63       63         28                4        0
  126  2048           0          160           568      408        284                1       80
  144  2336          52          620          8631     5747       4932                4     1648
  151  2448         123          406         10090     8736       6054                3      810
  168  2720           0          512         15738    14926      10492                2      540
  190  3072           0            2           136      130        102                3        3
  ...

demonstrates that class-896 has 12/26 = 46% of its pages unused, class-2336
has 1648/4932 = 33% of its pages unused, and so on. The more classes we have
as 'normal' classes (more than one object per zspage), the bigger this
problem will grow. The existing compaction relies either on user space (the
user can trigger compaction via zram's `compact' sysfs attribute) or on the
shrinker; it does not happen automatically.

This patch introduces a 'watermark' for unused pages and schedules
compaction work on a per-class basis once a class's fragmentation becomes
too high. Compaction is therefore not performed in the context of the
current I/O operation, but deferred to a workqueue worker.
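
Below is a minimal, self-contained kernel-module sketch (not part of the
patch) of that deferral pattern: a work_struct embedded in the per-class
object, queued on system_long_wq from the fast path and flushed on
teardown. All names here (demo_class, demo_compact_fn, etc.) are
illustrative only.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

struct demo_class {
	struct work_struct compact_work;
	bool compact_scheduled;
};

static struct demo_class demo;

static void demo_compact_fn(struct work_struct *work)
{
	struct demo_class *class = container_of(work, struct demo_class,
						compact_work);

	/* the heavy compaction work runs here, outside of the I/O path */
	class->compact_scheduled = false;
}

static int __init demo_init(void)
{
	INIT_WORK(&demo.compact_work, demo_compact_fn);

	/* fast path: schedule at most one work item per object */
	if (!demo.compact_scheduled) {
		demo.compact_scheduled = true;
		queue_work(system_long_wq, &demo.compact_work);
	}
	return 0;
}

static void __exit demo_exit(void)
{
	/* make sure no work is pending or running before teardown */
	cancel_work_sync(&demo.compact_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");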

The watermark is currently set to 40% -- if a class has 40% or more
`freeable' pages, compaction work will be scheduled for that class.
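
As an illustration, here is a minimal user-space sketch of that check,
applied to the class-896 and class-2336 rows from the table above. The
helper name and the use of the debugfs `freeable'/`pages_used' columns as
inputs are purely illustrative; the in-kernel check is class_watermark_ok()
in the diff below.

#include <stdbool.h>
#include <stdio.h>

/* same 40% threshold as the in-kernel check */
static bool watermark_ok(unsigned long freeable, unsigned long pages_used)
{
	if (!pages_used)
		return true;
	return (100 * freeable / pages_used) < 40;
}

int main(void)
{
	/* class-896:  12 of 26 pages freeable -> 46%, compaction scheduled */
	printf("class-896:  %s\n",
	       watermark_ok(12, 26) ? "ok" : "schedule compaction");
	/* class-2336: 1648 of 4932 pages freeable -> 33%, below watermark */
	printf("class-2336: %s\n",
	       watermark_ok(1648, 4932) ? "ok" : "schedule compaction");
	return 0;
}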

TEST
====

2G zram, ext4, lzo

iozone -t 1 -R -r 64K -s 1200M -I +Z

                            BASE       PATCHED
" Initial write "      959670.94     966724.62
" Rewrite "            1276167.62    1237632.88
" Read "               3334708.25    3345357.50
" Re-read "            3405310.75    3337137.25
" Reverse Read "       3284499.75    3241283.50
" Stride read "        3293417.75    3268364.00
" Random read "        3255253.50    3241685.00
" Mixed workload "     3274398.00    3231498.00
" Random write "       1253207.50    1216247.00
" Pwrite "              873682.25     877045.81
" Pread "              3173266.00    3318471.75
" Fwrite "              881278.38     897622.81
" Fread "              4397147.00    4501131.50

iozone -t 3 -R -r 64K -s 60M -I +Z

                            BASE       PATCHED
" Initial write "      1855931.62    1869576.31
" Rewrite "            2223531.06    2221543.62
" Read "               7958435.75    8023044.75
" Re-read "            7912776.75    8068961.00
" Reverse Read "       7832227.50    7788237.50
" Stride read "        7952113.50    7919778.00
" Random read "        7908816.00    7881792.50
" Mixed workload "     6364520.38    6332493.94
" Random write "       2230115.69    2176777.19
" Pwrite "             1915939.31    1929464.75
" Pread "              3857052.91    3840517.91
" Fwrite "             2271730.44    2272800.31
" Fread "              9053867.00    8880966.25

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@xxxxxxxxx>
---
mm/zsmalloc.c | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e72efb1..a4ef7e7 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -219,6 +219,10 @@ struct size_class {
 	int pages_per_zspage;
 	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
 	bool huge;
+
+	bool compact_scheduled;
+	struct zs_pool *pool;
+	struct work_struct compact_work;
 };
 
 /*
@@ -1467,6 +1471,8 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
 	zs_stat_dec(class, OBJ_USED, 1);
 }
 
+static bool class_watermark_ok(struct size_class *class);
+
 void zs_free(struct zs_pool *pool, unsigned long handle)
 {
 	struct page *first_page, *f_page;
@@ -1495,6 +1501,11 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 		atomic_long_sub(class->pages_per_zspage,
 				&pool->pages_allocated);
 		free_zspage(first_page);
+	} else {
+		if (!class_watermark_ok(class) && !class->compact_scheduled) {
+			queue_work(system_long_wq, &class->compact_work);
+			class->compact_scheduled = true;
+		}
 	}
 	spin_unlock(&class->lock);
 	unpin_tag(handle);
@@ -1745,6 +1756,19 @@ static unsigned long zs_can_compact(struct size_class *class)
 	return obj_wasted * class->pages_per_zspage;
 }
 
+static bool class_watermark_ok(struct size_class *class)
+{
+	unsigned long pages_used = zs_stat_get(class, OBJ_ALLOCATED);
+
+	pages_used /= get_maxobj_per_zspage(class->size,
+			class->pages_per_zspage) * class->pages_per_zspage;
+
+	if (!pages_used)
+		return true;
+
+	return (100 * zs_can_compact(class) / pages_used) < 40;
+}
+
 static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 {
 	struct zs_compact_control cc;
@@ -1789,9 +1813,17 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 	if (src_page)
 		putback_zspage(pool, class, src_page);
 
+	class->compact_scheduled = false;
 	spin_unlock(&class->lock);
 }
 
+static void class_compaction_work(struct work_struct *work)
+{
+	struct size_class *class = container_of(work, struct size_class, compact_work);
+
+	__zs_compact(class->pool, class);
+}
+
 unsigned long zs_compact(struct zs_pool *pool)
 {
 	int i;
@@ -1948,6 +1980,9 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 		if (pages_per_zspage == 1 &&
 			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
 			class->huge = true;
+
+		INIT_WORK(&class->compact_work, class_compaction_work);
+		class->pool = pool;
 		spin_lock_init(&class->lock);
 		pool->size_class[i] = class;
 
@@ -1990,6 +2025,8 @@ void zs_destroy_pool(struct zs_pool *pool)
 		if (class->index != i)
 			continue;
 
+		cancel_work_sync(&class->compact_work);
+
 		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
 			if (class->fullness_list[fg]) {
 				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
--
2.8.0.rc0