Re: [PATCH 2/4] virtual block device driver (ramzswap)

From: Pekka Enberg
Date: Mon Sep 14 2009 - 16:10:42 EST


Hi Nitin,

I am not a block driver expert but here are some comments on the code
that probably need to be addressed before merging.

Pekka

On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@xxxxxxxxxx> wrote:
> @@ -0,0 +1,1529 @@
> +/*
> + * Compressed RAM based swap device
> + *
> + * Copyright (C) 2008, 2009  Nitin Gupta
> + *
> + * This code is released using a dual license strategy: BSD/GPL
> + * You can choose the licence that better fits your requirements.
> + *
> + * Released under the terms of 3-clause BSD License
> + * Released under the terms of GNU General Public License Version 2.0
> + *
> + * Project home: http://compcache.googlecode.com
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/bitops.h>
> +#include <linux/blkdev.h>
> +#include <linux/buffer_head.h>
> +#include <linux/device.h>
> +#include <linux/genhd.h>
> +#include <linux/highmem.h>
> +#include <linux/lzo.h>
> +#include <linux/marker.h>
> +#include <linux/mutex.h>
> +#include <linux/string.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +#include <linux/vmalloc.h>
> +#include <linux/version.h>
> +
> +#include "ramzswap_drv.h"
> +
> +/* Globals */
> +static int RAMZSWAP_MAJOR;
> +static struct ramzswap *DEVICES;
> +
> +/*
> + * Pages that compress to larger than this size are
> + * forwarded to backing swap, if present or stored
> + * uncompressed in memory otherwise.
> + */
> +static unsigned int MAX_CPAGE_SIZE;
> +
> +/* Module params (documentation at end) */
> +static unsigned long NUM_DEVICES;

These variable names should be in lower case.

> +
> +/* Function declarations */
> +static int __init ramzswap_init(void);
> +static int ramzswap_ioctl(struct block_device *, fmode_t,
> +                       unsigned, unsigned long);
> +static int setup_swap_header(struct ramzswap *, union swap_header *);
> +static void ramzswap_set_memlimit(struct ramzswap *, size_t);
> +static void ramzswap_set_disksize(struct ramzswap *, size_t);
> +static void reset_device(struct ramzswap *rzs);

It's preferable not to use forward declarations in new kernel code.

> +static int test_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
> +{
> +       return rzs->table[index].flags & BIT(flag);
> +}
> +
> +static void set_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
> +{
> +       rzs->table[index].flags |= BIT(flag);
> +}
> +
> +static void clear_flag(struct ramzswap *rzs, u32 index,
> +                                       enum rzs_pageflags flag)
> +{
> +       rzs->table[index].flags &= ~BIT(flag);
> +}

These function names could use a ramzswap specific prefix.

> +
> +static int page_zero_filled(void *ptr)
> +{
> +       u32 pos;
> +       u64 *page;
> +
> +       page = (u64 *)ptr;
> +
> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
> +               if (page[pos])
> +                       return 0;
> +       }
> +
> +       return 1;
> +}

This looks like something that could be in lib/string.c.

/me looks

There's strspn so maybe you could introduce a memspn equivalent.

> +
> +/*
> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
> + */
> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
> +{
> +       unsigned char *base;
> +
> +       base = kmap_atomic(page, type);
> +       return base + offset;
> +}
> +
> +static void put_ptr_atomic(void *ptr, enum km_type type)
> +{
> +       kunmap_atomic(ptr, type);
> +}

These two functions also appear in xmalloc. It's probably best to just
kill the wrappers and use kmap/kunmap directly.

> +
> +static void ramzswap_flush_dcache_page(struct page *page)
> +{
> +#ifdef CONFIG_ARM
> +       int flag = 0;
> +       /*
> +        * Ugly hack to get flush_dcache_page() work on ARM.
> +        * page_mapping(page) == NULL after clearing this swap cache flag.
> +        * Without clearing this flag, flush_dcache_page() will simply set
> +        * "PG_dcache_dirty" bit and return.
> +        */
> +       if (PageSwapCache(page)) {
> +               flag = 1;
> +               ClearPageSwapCache(page);
> +       }
> +#endif
> +       flush_dcache_page(page);
> +#ifdef CONFIG_ARM
> +       if (flag)
> +               SetPageSwapCache(page);
> +#endif
> +}

The above CONFIG_ARM magic really has no place in drivers/block.

> +static int add_backing_swap_extent(struct ramzswap *rzs,
> +                               pgoff_t phy_pagenum,
> +                               pgoff_t num_pages)
> +{
> +       unsigned int idx;
> +       struct list_head *head;
> +       struct page *curr_page, *new_page;
> +       unsigned int extents_per_page = PAGE_SIZE /
> +                               sizeof(struct ramzswap_backing_extent);
> +
> +       idx = rzs->num_extents % extents_per_page;
> +       if (!idx) {
> +               new_page = alloc_page(__GFP_ZERO);
> +               if (!new_page)
> +                       return -ENOMEM;
> +
> +               if (rzs->num_extents) {
> +                       curr_page = virt_to_page(rzs->curr_extent);
> +                       head = &curr_page->lru;
> +               } else {
> +                       head = &rzs->backing_swap_extent_list;
> +               }
> +
> +               list_add(&new_page->lru, head);
> +               rzs->curr_extent = page_address(new_page);
> +       }
> +
> +       rzs->curr_extent->phy_pagenum = phy_pagenum;
> +       rzs->curr_extent->num_pages = num_pages;
> +
> +       pr_debug(C "extent: [%lu, %lu] %lu\n", phy_pagenum,
> +               phy_pagenum + num_pages - 1, num_pages);

What's this "C" thing everywhere? A subsystem prefix? Shouldn't you
override pr_fmt() instead?

> +static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
> +{
> +       int ret, fwd_write_request = 0;
> +       u32 offset;
> +       size_t clen;
> +       pgoff_t index;
> +       struct zobj_header *zheader;
> +       struct page *page, *page_store;
> +       unsigned char *user_mem, *cmem, *src;
> +
> +       stat_inc(rzs->stats.num_writes);
> +
> +       page = bio->bi_io_vec[0].bv_page;
> +       index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
> +
> +       src = rzs->compress_buffer;
> +
> +       /*
> +        * System swaps to same sector again when the stored page
> +        * is no longer referenced by any process. So, its now safe
> +        * to free the memory that was allocated for this page.
> +        */
> +       if (rzs->table[index].page)
> +               ramzswap_free_page(rzs, index);
> +
> +       /*
> +        * No memory ia allocated for zero filled pages.
> +        * Simply clear zero page flag.
> +        */
> +       if (test_flag(rzs, index, RZS_ZERO)) {
> +               stat_dec(rzs->stats.pages_zero);
> +               clear_flag(rzs, index, RZS_ZERO);
> +       }
> +
> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> +       mutex_lock(&rzs->lock);
> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");

Hmm? What's this? I don't think you should be doing ad hoc
trace_mark() in driver code.

> +
> +       user_mem = get_ptr_atomic(page, 0, KM_USER0);
> +       if (page_zero_filled(user_mem)) {
> +               put_ptr_atomic(user_mem, KM_USER0);
> +               mutex_unlock(&rzs->lock);
> +               stat_inc(rzs->stats.pages_zero);
> +               set_flag(rzs, index, RZS_ZERO);
> +
> +               set_bit(BIO_UPTODATE, &bio->bi_flags);
> +               bio_endio(bio, 0);
> +               return 0;
> +       }
> +
> +       if (rzs->backing_swap &&
> +               (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
> +               put_ptr_atomic(user_mem, KM_USER0);
> +               mutex_unlock(&rzs->lock);
> +               fwd_write_request = 1;
> +               goto out;
> +       }
> +
> +       ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
> +                               rzs->compress_workmem);
> +
> +       put_ptr_atomic(user_mem, KM_USER0);
> +
> +       if (unlikely(ret != LZO_E_OK)) {
> +               mutex_unlock(&rzs->lock);
> +               pr_err(C "Compression failed! err=%d\n", ret);
> +               stat_inc(rzs->stats.failed_writes);
> +               goto out;
> +       }
> +
> +       /*
> +        * Page is incompressible. Forward it to backing swap
> +        * if present. Otherwise, store it as-is (uncompressed)
> +        * since we do not want to return too many swap write
> +        * errors which has side effect of hanging the system.
> +        */
> +       if (unlikely(clen > MAX_CPAGE_SIZE)) {
> +               if (rzs->backing_swap) {
> +                       mutex_unlock(&rzs->lock);
> +                       fwd_write_request = 1;
> +                       goto out;
> +               }
> +
> +               clen = PAGE_SIZE;
> +               page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
> +               if (unlikely(!page_store)) {
> +                       mutex_unlock(&rzs->lock);
> +                       pr_info(C "Error allocating memory for incompressible "
> +                               "page: %lu\n", index);
> +                       stat_inc(rzs->stats.failed_writes);
> +                       goto out;
> +               }
> +
> +               offset = 0;
> +               set_flag(rzs, index, RZS_UNCOMPRESSED);
> +               stat_inc(rzs->stats.pages_expand);
> +               rzs->table[index].page = page_store;
> +               src = get_ptr_atomic(page, 0, KM_USER0);
> +               goto memstore;
> +       }
> +
> +       if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
> +                       &rzs->table[index].page, &offset,
> +                       GFP_NOIO | __GFP_HIGHMEM)) {
> +               mutex_unlock(&rzs->lock);
> +               pr_info(C "Error allocating memory for compressed "
> +                       "page: %lu, size=%zu\n", index, clen);
> +               stat_inc(rzs->stats.failed_writes);
> +               if (rzs->backing_swap)
> +                       fwd_write_request = 1;
> +               goto out;
> +       }
> +
> +memstore:
> +       rzs->table[index].offset = offset;
> +
> +       cmem = get_ptr_atomic(rzs->table[index].page,
> +                       rzs->table[index].offset, KM_USER1);
> +
> +#if 0
> +       /* Back-reference needed for memory defragmentation */
> +       if (!test_flag(rzs, index, RZS_UNCOMPRESSED)) {
> +               zheader = (struct zobj_header *)cmem;
> +               zheader->table_idx = index;
> +               cmem += sizeof(*zheader);
> +       }
> +#endif

Drop the above dead code?

> +
> +       memcpy(cmem, src, clen);
> +
> +       put_ptr_atomic(cmem, KM_USER1);
> +       if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
> +               put_ptr_atomic(src, KM_USER0);
> +
> +       /* Update stats */
> +       rzs->stats.compr_size += clen;
> +       stat_inc(rzs->stats.pages_stored);
> +       stat_inc_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
> +
> +       mutex_unlock(&rzs->lock);
> +
> +       set_bit(BIO_UPTODATE, &bio->bi_flags);
> +       bio_endio(bio, 0);
> +       return 0;
> +
> +out:
> +       if (fwd_write_request) {
> +               stat_inc(rzs->stats.bdev_num_writes);
> +               bio->bi_bdev = rzs->backing_swap;
> +#if 0
> +               /*
> +                * TODO: We currently have linear mapping of ramzswap and
> +                * backing swap sectors. This is not desired since we want
> +                * to optimize writes to backing swap to minimize disk seeks
> +                * or have effective wear leveling (for SSDs). Also, a
> +                * non-linear mapping is required to implement compressed
> +                * on-disk swapping.
> +                */
> +                bio->bi_sector = get_backing_swap_page()
> +                                       << SECTORS_PER_PAGE_SHIFT;
> +#endif

This too?

> +static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
> +{
> +       int ret;
> +       size_t num_pages, totalram_bytes;
> +       struct sysinfo i;
> +       struct page *page;
> +       union swap_header *swap_header;
> +
> +       if (rzs->init_done) {
> +               pr_info(C "Device already initialized!\n");
> +               return -EBUSY;
> +       }
> +
> +       ret = setup_backing_swap(rzs);
> +       if (ret)
> +               goto fail;
> +
> +       si_meminfo(&i);
> +       /* Here is a trivia: guess unit used for i.totalram !! */
> +       totalram_bytes = i.totalram << PAGE_SHIFT;

You can use totalram_pages here. OTOH, I'm not sure why the driver
needs this information. Hmm?

> +
> +       if (rzs->backing_swap)
> +               ramzswap_set_memlimit(rzs, totalram_bytes);
> +       else
> +               ramzswap_set_disksize(rzs, totalram_bytes);
> +
> +       rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
> +       if (rzs->compress_workmem == NULL) {
> +               pr_err(C "Error allocating compressor working memory!\n");
> +               ret = -ENOMEM;
> +               goto fail;
> +       }
> +
> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);

Use alloc_pages(__GFP_ZERO) here?

> diff --git a/drivers/block/ramzswap/ramzswap_drv.h b/drivers/block/ramzswap/ramzswap_drv.h
> new file mode 100644
> index 0000000..7f77edc
> --- /dev/null
> +++ b/drivers/block/ramzswap/ramzswap_drv.h

> +
> +#define SECTOR_SHIFT           9
> +#define SECTOR_SIZE            (1 << SECTOR_SHIFT)
> +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
> +#define SECTORS_PER_PAGE       (1 << SECTORS_PER_PAGE_SHIFT)

Don't we have these defines somewhere in include/linux?

> +
> +/* Message prefix */
> +#define C "ramzswap: "

Use pr_fmt() instead.

> +
> +/* Debugging and Stats */
> +#define NOP    do { } while (0)

Huh? Drop this.

> +
> +#if defined(CONFIG_BLK_DEV_RAMZSWAP_STATS)
> +#define STATS
> +#endif

Why can't you rename that to CONFIG_RAMZSWAP_STATS and use that
instead of the very generic STATS?

> +
> +#if defined(STATS)
> +#define stat_inc(stat)                 ((stat)++)
> +#define stat_dec(stat)                 ((stat)--)
> +#define stat_inc_if_less(stat, val1, val2) \
> +                               ((stat) += ((val1) < (val2) ? 1 : 0))
> +#define stat_dec_if_less(stat, val1, val2) \
> +                               ((stat) -= ((val1) < (val2) ? 1 : 0))
> +#else  /* STATS */
> +#define stat_inc(x)                    NOP
> +#define stat_dec(x)                    NOP
> +#define stat_inc_if_less(x, v1, v2)    NOP
> +#define stat_dec_if_less(x, v1, v2)    NOP
> +#endif /* STATS */

Why do you need inc_if_less() and dec_if_less()? And why are these not
static inlines?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/