[PATCHES] improve sysfs performance, make it O(log n)

From: Mikulas Patocka
Date: Wed Jul 20 2011 - 18:57:05 EST


Hi

Here are 5 patches to make sysfs O(log n) and improve performance on
operations with many block devices.

Patch 1 changes synchronize_rcu() to synchronize_rcu_expedited(), it
causes massive speedup when deleting many block devices.

Patch 2 fixes inefficient i_nlink calculation in sysfs.

Patch 3 makes name lookups use rb-tree.

Patch 4 removes s_sibling hack (it has no effect on performance).

Patch 5 makes inode number lookups use rb-tree --- thus making all types
of sysfs directory operations O(log n).


Performance testing --- 10000 dm devices created with dmsetup:

The numbers are time in seconds to perform these operations:
1. create 10000 devices
2. ls -la /sys/block
3. ls -al /sys/block after a cache flush (echo 3 >/proc/sys/vm/drop_caches)
4. dmsetup remove_all

no patches 109 9.8 12 1624
patch 1 109 9.1 12 15
patches 1,2 120 0.19 3.0 16
patches 1,2,3 68 0.17 0.47 12
patches 1,2,3,4 68 0.15 0.48 11
all patches 37 0.16 0.46 3.3

Mikulasbacking-dev: use synchronize_rcu_expedited instead of synchronize_rcu

synchronize_rcu sleeps several timer ticks.
When removing thousands block devices, this sleeping
causes a severe delay.

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>

---
mm/backing-dev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-3.0-rc7-fast/mm/backing-dev.c
===================================================================
--- linux-3.0-rc7-fast.orig/mm/backing-dev.c 2011-07-19 18:01:00.000000000 +0200
+++ linux-3.0-rc7-fast/mm/backing-dev.c 2011-07-19 18:01:07.000000000 +0200
@@ -505,7 +505,7 @@ static void bdi_remove_from_list(struct
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);

- synchronize_rcu();
+ synchronize_rcu_expedited();
}

int bdi_register(struct backing_dev_info *bdi, struct device *parent,
sysfs: count subdirectories

This patch introduces a subdirectory counter for each sysfs directory.

Without the patch, sysfs_refresh_inode would walk all entries of the directory
to calculate the number of subdirectories.

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>

---
fs/sysfs/dir.c | 6 ++++++
fs/sysfs/inode.c | 14 +-------------
fs/sysfs/sysfs.h | 2 ++
3 files changed, 9 insertions(+), 13 deletions(-)

Index: linux-3.0-rc7-fast/fs/sysfs/dir.c
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/dir.c 2011-07-18 19:43:05.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/dir.c 2011-07-18 19:45:06.000000000 +0200
@@ -47,6 +47,9 @@ static void sysfs_link_sibling(struct sy

BUG_ON(sd->s_sibling);

+ if (sysfs_type(sd) == SYSFS_DIR)
+ parent_sd->s_dir.subdirs++;
+
/* Store directory entries in order by ino. This allows
* readdir to properly restart without having to add a
* cursor into the s_dir.children list.
@@ -73,6 +76,9 @@ static void sysfs_unlink_sibling(struct
{
struct sysfs_dirent **pos;

+ if (sysfs_type(sd) == SYSFS_DIR)
+ sd->s_parent->s_dir.subdirs--;
+
for (pos = &sd->s_parent->s_dir.children; *pos;
pos = &(*pos)->s_sibling) {
if (*pos == sd) {
Index: linux-3.0-rc7-fast/fs/sysfs/inode.c
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/inode.c 2011-07-18 19:43:33.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/inode.c 2011-07-18 19:45:50.000000000 +0200
@@ -202,18 +202,6 @@ static inline void set_inode_attr(struct
inode->i_ctime = iattr->ia_ctime;
}

-static int sysfs_count_nlink(struct sysfs_dirent *sd)
-{
- struct sysfs_dirent *child;
- int nr = 0;
-
- for (child = sd->s_dir.children; child; child = child->s_sibling)
- if (sysfs_type(child) == SYSFS_DIR)
- nr++;
-
- return nr + 2;
-}
-
static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
{
struct sysfs_inode_attrs *iattrs = sd->s_iattr;
@@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct s
}

if (sysfs_type(sd) == SYSFS_DIR)
- inode->i_nlink = sysfs_count_nlink(sd);
+ inode->i_nlink = sd->s_dir.subdirs + 2;
}

int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
Index: linux-3.0-rc7-fast/fs/sysfs/sysfs.h
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/sysfs.h 2011-07-18 19:42:42.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/sysfs.h 2011-07-18 19:44:28.000000000 +0200
@@ -19,6 +19,8 @@ struct sysfs_elem_dir {
struct kobject *kobj;
/* children list starts here and goes through sd->s_sibling */
struct sysfs_dirent *children;
+
+ unsigned long subdirs;
};

struct sysfs_elem_symlink {
sysfs: use rb-tree for name lookups

Use red-black tree for name lookups.

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>

---
fs/sysfs/dir.c | 45 +++++++++++++++++++++++++++++++++++++++------
fs/sysfs/sysfs.h | 5 +++++
2 files changed, 44 insertions(+), 6 deletions(-)

Index: linux-3.0-rc7-fast/fs/sysfs/sysfs.h
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/sysfs.h 2011-07-18 20:06:24.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/sysfs.h 2011-07-18 20:16:32.000000000 +0200
@@ -11,6 +11,7 @@
#include <linux/lockdep.h>
#include <linux/kobject_ns.h>
#include <linux/fs.h>
+#include <linux/rbtree.h>

struct sysfs_open_dirent;

@@ -21,6 +22,8 @@ struct sysfs_elem_dir {
struct sysfs_dirent *children;

unsigned long subdirs;
+
+ struct rb_root name_tree;
};

struct sysfs_elem_symlink {
@@ -61,6 +64,8 @@ struct sysfs_dirent {
struct sysfs_dirent *s_sibling;
const char *s_name;

+ struct rb_node name_node;
+
const void *s_ns; /* namespace tag */
union {
struct sysfs_elem_dir s_dir;
Index: linux-3.0-rc7-fast/fs/sysfs/dir.c
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/dir.c 2011-07-18 20:06:24.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/dir.c 2011-07-18 20:16:02.000000000 +0200
@@ -45,6 +45,9 @@ static void sysfs_link_sibling(struct sy
struct sysfs_dirent *parent_sd = sd->s_parent;
struct sysfs_dirent **pos;

+ struct rb_node **p;
+ struct rb_node *parent;
+
BUG_ON(sd->s_sibling);

if (sysfs_type(sd) == SYSFS_DIR)
@@ -60,6 +63,26 @@ static void sysfs_link_sibling(struct sy
}
sd->s_sibling = *pos;
*pos = sd;
+
+ p = &parent_sd->s_dir.name_tree.rb_node;
+ parent = NULL;
+ while (*p) {
+ int c;
+ parent = *p;
+#define node rb_entry(parent, struct sysfs_dirent, name_node)
+ c = strcmp(sd->s_name, node->s_name);
+ if (c < 0) {
+ p = &node->name_node.rb_left;
+ } else if (c > 0) {
+ p = &node->name_node.rb_right;
+ } else {
+ printk(KERN_CRIT "sysfs: inserting duplicate filename '%s'\n", sd->s_name);
+ BUG();
+ }
+#undef node
+ }
+ rb_link_node(&sd->name_node, parent, p);
+ rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree);
}

/**
@@ -87,6 +110,8 @@ static void sysfs_unlink_sibling(struct
break;
}
}
+
+ rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree);
}

/**
@@ -546,14 +571,22 @@ struct sysfs_dirent *sysfs_find_dirent(s
const void *ns,
const unsigned char *name)
{
- struct sysfs_dirent *sd;
+ struct rb_node *p = parent_sd->s_dir.name_tree.rb_node;

- for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
- if (ns && sd->s_ns && (sd->s_ns != ns))
- continue;
- if (!strcmp(sd->s_name, name))
- return sd;
+ while (p) {
+ int c;
+#define node rb_entry(p, struct sysfs_dirent, name_node)
+ c = strcmp(name, node->s_name);
+ if (c < 0) {
+ p = node->name_node.rb_left;
+ } else if (c > 0) {
+ p = node->name_node.rb_right;
+ } else {
+ return node;
+ }
+#undef node
}
+
return NULL;
}

sysfs: remove s_sibling hacks

s_sibling was used for three different purposes:
1) as a linked list of entries in the directory
2) as a linked list of entries to be deleted
3) as a pointer to "struct completion"

This patch removes the hack and introduces new union u which
holds pointers for cases 2) and 3).

This change is needed for the following patch that removes s_sibling at all
and replaces it with a rb tree.

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>

---
fs/sysfs/dir.c | 19 +++++++------------
fs/sysfs/sysfs.h | 5 +++++
2 files changed, 12 insertions(+), 12 deletions(-)

Index: linux-3.0-rc7-fast/fs/sysfs/dir.c
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/dir.c 2011-07-20 20:42:25.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/dir.c 2011-07-20 20:55:15.000000000 +0200
@@ -157,7 +157,6 @@ struct sysfs_dirent *sysfs_get_active(st
*/
void sysfs_put_active(struct sysfs_dirent *sd)
{
- struct completion *cmpl;
int v;

if (unlikely(!sd))
@@ -169,10 +168,9 @@ void sysfs_put_active(struct sysfs_diren
return;

/* atomic_dec_return() is a mb(), we'll always see the updated
- * sd->s_sibling.
+ * sd->u.completion.
*/
- cmpl = (void *)sd->s_sibling;
- complete(cmpl);
+ complete(sd->u.completion);
}

/**
@@ -186,16 +184,16 @@ static void sysfs_deactivate(struct sysf
DECLARE_COMPLETION_ONSTACK(wait);
int v;

- BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
+ BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));

if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
return;

- sd->s_sibling = (void *)&wait;
+ sd->u.completion = (void *)&wait;

rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
/* atomic_add_return() is a mb(), put_active() will always see
- * the updated sd->s_sibling.
+ * the updated sd->u.completion.
*/
v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);

@@ -204,8 +202,6 @@ static void sysfs_deactivate(struct sysf
wait_for_completion(&wait);
}

- sd->s_sibling = NULL;
-
lock_acquired(&sd->dep_map, _RET_IP_);
rwsem_release(&sd->dep_map, 1, _RET_IP_);
}
@@ -521,7 +517,7 @@ void sysfs_remove_one(struct sysfs_addrm
}

sd->s_flags |= SYSFS_FLAG_REMOVED;
- sd->s_sibling = acxt->removed;
+ sd->u.removed_list = acxt->removed;
acxt->removed = sd;
}

@@ -545,8 +541,7 @@ void sysfs_addrm_finish(struct sysfs_add
while (acxt->removed) {
struct sysfs_dirent *sd = acxt->removed;

- acxt->removed = sd->s_sibling;
- sd->s_sibling = NULL;
+ acxt->removed = sd->u.removed_list;

sysfs_deactivate(sd);
unmap_bin_file(sd);
Index: linux-3.0-rc7-fast/fs/sysfs/sysfs.h
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/sysfs.h 2011-07-20 20:42:12.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/sysfs.h 2011-07-20 20:46:50.000000000 +0200
@@ -66,6 +66,11 @@ struct sysfs_dirent {

struct rb_node name_node;

+ union {
+ struct completion *completion;
+ struct sysfs_dirent *removed_list;
+ } u;
+
const void *s_ns; /* namespace tag */
union {
struct sysfs_elem_dir s_dir;
sysfs: use rb-tree for inode number lookup

This patch makes sysfs use red-black tree for inode number lookup.
Together with a previous patch to use red-black tree for name lookup,
this patch makes all sysfs lookups to have O(log n) complexity.

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>

---
fs/sysfs/dir.c | 89 ++++++++++++++++++++++++++++++-------------------------
fs/sysfs/sysfs.h | 5 +--
2 files changed, 52 insertions(+), 42 deletions(-)

Index: linux-3.0-rc7-fast/fs/sysfs/dir.c
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/dir.c 2011-07-20 20:55:15.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/dir.c 2011-07-20 23:30:47.000000000 +0200
@@ -43,26 +43,30 @@ static DEFINE_IDA(sysfs_ino_ida);
static void sysfs_link_sibling(struct sysfs_dirent *sd)
{
struct sysfs_dirent *parent_sd = sd->s_parent;
- struct sysfs_dirent **pos;

struct rb_node **p;
struct rb_node *parent;

- BUG_ON(sd->s_sibling);
-
if (sysfs_type(sd) == SYSFS_DIR)
parent_sd->s_dir.subdirs++;

- /* Store directory entries in order by ino. This allows
- * readdir to properly restart without having to add a
- * cursor into the s_dir.children list.
- */
- for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
- if (sd->s_ino < (*pos)->s_ino)
- break;
+ p = &parent_sd->s_dir.inode_tree.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+#define node rb_entry(parent, struct sysfs_dirent, inode_node)
+ if (sd->s_ino < node->s_ino) {
+ p = &node->inode_node.rb_left;
+ } else if (sd->s_ino > node->s_ino) {
+ p = &node->inode_node.rb_right;
+ } else {
+ printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", sd->s_ino);
+ BUG();
+ }
+#undef node
}
- sd->s_sibling = *pos;
- *pos = sd;
+ rb_link_node(&sd->inode_node, parent, p);
+ rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree);

p = &parent_sd->s_dir.name_tree.rb_node;
parent = NULL;
@@ -97,20 +101,10 @@ static void sysfs_link_sibling(struct sy
*/
static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
{
- struct sysfs_dirent **pos;
-
if (sysfs_type(sd) == SYSFS_DIR)
sd->s_parent->s_dir.subdirs--;

- for (pos = &sd->s_parent->s_dir.children; *pos;
- pos = &(*pos)->s_sibling) {
- if (*pos == sd) {
- *pos = sd->s_sibling;
- sd->s_sibling = NULL;
- break;
- }
- }
-
+ rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree);
rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree);
}

@@ -778,21 +772,19 @@ void sysfs_remove_subdir(struct sysfs_di
static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
{
struct sysfs_addrm_cxt acxt;
- struct sysfs_dirent **pos;
+ struct rb_node *pos;

if (!dir_sd)
return;

pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
sysfs_addrm_start(&acxt, dir_sd);
- pos = &dir_sd->s_dir.children;
- while (*pos) {
- struct sysfs_dirent *sd = *pos;
-
+ pos = rb_first(&dir_sd->s_dir.inode_tree);
+ while (pos) {
+ struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node);
+ pos = rb_next(pos);
if (sysfs_type(sd) != SYSFS_DIR)
sysfs_remove_one(&acxt, sd);
- else
- pos = &(*pos)->s_sibling;
}
sysfs_addrm_finish(&acxt);

@@ -915,12 +907,28 @@ static struct sysfs_dirent *sysfs_dir_po
pos = NULL;
}
if (!pos && (ino > 1) && (ino < INT_MAX)) {
- pos = parent_sd->s_dir.children;
- while (pos && (ino > pos->s_ino))
- pos = pos->s_sibling;
+ struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node;
+ while (p) {
+#define node rb_entry(p, struct sysfs_dirent, inode_node)
+ if (ino < node->s_ino) {
+ pos = node;
+ p = node->inode_node.rb_left;
+ } else if (node->s_ino > ino) {
+ p = node->inode_node.rb_right;
+ } else {
+ pos = node;
+ break;
+ }
+#undef node
+ }
+ }
+ while (pos && pos->s_ns && pos->s_ns != ns) {
+ struct rb_node *p = rb_next(&pos->inode_node);
+ if (!p)
+ pos = NULL;
+ else
+ pos = rb_entry(p, struct sysfs_dirent, inode_node);
}
- while (pos && pos->s_ns && pos->s_ns != ns)
- pos = pos->s_sibling;
return pos;
}

@@ -928,10 +936,13 @@ static struct sysfs_dirent *sysfs_dir_ne
struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
{
pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
- if (pos)
- pos = pos->s_sibling;
- while (pos && pos->s_ns && pos->s_ns != ns)
- pos = pos->s_sibling;
+ if (pos) do {
+ struct rb_node *p = rb_next(&pos->inode_node);
+ if (!p)
+ pos = NULL;
+ else
+ pos = rb_entry(p, struct sysfs_dirent, inode_node);
+ } while (pos && pos->s_ns && pos->s_ns != ns);
return pos;
}

Index: linux-3.0-rc7-fast/fs/sysfs/sysfs.h
===================================================================
--- linux-3.0-rc7-fast.orig/fs/sysfs/sysfs.h 2011-07-20 20:46:50.000000000 +0200
+++ linux-3.0-rc7-fast/fs/sysfs/sysfs.h 2011-07-20 21:00:28.000000000 +0200
@@ -18,11 +18,10 @@ struct sysfs_open_dirent;
/* type-specific structures for sysfs_dirent->s_* union members */
struct sysfs_elem_dir {
struct kobject *kobj;
- /* children list starts here and goes through sd->s_sibling */
- struct sysfs_dirent *children;

unsigned long subdirs;

+ struct rb_root inode_tree;
struct rb_root name_tree;
};

@@ -61,9 +60,9 @@ struct sysfs_dirent {
struct lockdep_map dep_map;
#endif
struct sysfs_dirent *s_parent;
- struct sysfs_dirent *s_sibling;
const char *s_name;

+ struct rb_node inode_node;
struct rb_node name_node;

union {