Re: [PATCH v5 5/7] blktrace: fix debugfs use after free

From: Luis Chamberlain
Date: Tue May 26 2020 - 23:12:10 EST


On Tue, May 19, 2020 at 09:37:13AM -0700, Christoph Hellwig wrote:
> I don't think we need any of that symlink stuff. Even if we want it
> (which I don't), it should not be in a bug fix patch.
>
> In fact to fix the blktrace race I think we only need something like
> this fairly trivial patch (completely untested so far) below.
>
> (and with that we can also drop the previous patch, as blk-debugfs.c
> becomes rather pointless)

You forgot to deal with partitions. Putting similar lipstick on the pig,
this is what I end up with, let me know if this seems agreeable:

diff --git a/block/blk-core.c b/block/blk-core.c
index 2096373bd16d..67cd9ddac822 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,9 +51,7 @@
#include "blk-pm.h"
#include "blk-rq-qos.h"

-#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
-#endif

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -1877,9 +1875,7 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);

-#ifdef CONFIG_DEBUG_FS
blk_debugfs_root = debugfs_create_dir("block", NULL);
-#endif

return 0;
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 96b7a35c898a..08edc3a54114 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -822,9 +822,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;

- q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
- blk_debugfs_root);
-
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);

/*
@@ -855,9 +852,7 @@ void blk_mq_debugfs_register(struct request_queue *q)

void blk_mq_debugfs_unregister(struct request_queue *q)
{
- debugfs_remove_recursive(q->debugfs_dir);
q->sched_debugfs_dir = NULL;
- q->debugfs_dir = NULL;
}

static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 561624d4cc4e..5babb6547f48 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -11,6 +11,7 @@
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
+#include <linux/debugfs.h>

#include "blk.h"
#include "blk-mq.h"
@@ -918,6 +919,7 @@ static void blk_release_queue(struct kobject *kobj)

blk_trace_shutdown(q);

+ debugfs_remove_recursive(q->debugfs_dir);
if (queue_is_mq(q))
blk_mq_debugfs_unregister(q);

@@ -989,6 +991,28 @@ int blk_register_queue(struct gendisk *disk)
goto unlock;
}

+ /*
+ * Blktrace needs a debugsfs name even for queues that don't register
+ * a gendisk, so it lazily registers the debugfs directory. But that
+ * can get us into a situation where a SCSI device is found, with no
+ * driver for it (yet). Then blktrace is used on the device, creating
+ * the debugfs directory, and only after that a drivers is loaded. In
+ * that case we might already have a debugfs directory registered here.
+ * Even worse we could be racing with blktrace to register it.
+ */
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ mutex_lock(&q->blk_trace_mutex);
+ if (!q->debugfs_dir) {
+ q->debugfs_dir =
+ debugfs_create_dir(kobject_name(q->kobj.parent),
+ blk_debugfs_root);
+ }
+ mutex_unlock(&q->blk_trace_mutex);
+#else
+ q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
+ blk_debugfs_root);
+#endif
+
if (queue_is_mq(q)) {
__blk_mq_register_dev(dev, q);
blk_mq_debugfs_register(q);
diff --git a/block/blk.h b/block/blk.h
index 5db4ec1e85f7..f11f79295419 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -14,9 +14,7 @@
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)

-#ifdef CONFIG_DEBUG_FS
extern struct dentry *blk_debugfs_root;
-#endif

struct blk_flush_queue {
unsigned int flush_pending_idx:1;
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 297004fd2264..95f9019aac83 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -10,6 +10,7 @@
#include <linux/vmalloc.h>
#include <linux/blktrace_api.h>
#include <linux/raid/detect.h>
+#include <linux/debugfs.h>
#include "check.h"

static int (*check_part[])(struct parsed_partitions *) = {
@@ -322,6 +323,7 @@ void delete_partition(struct gendisk *disk, struct hd_struct *part)
get_device(disk_to_dev(part_to_disk(part)));
rcu_assign_pointer(ptbl->part[part->partno], NULL);
kobject_put(part->holder_dir);
+ debugfs_remove_recursive(part->debugfs_dir);
device_del(part_to_dev(part));

/*
@@ -443,6 +445,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!p->holder_dir)
goto out_del;

+ p->debugfs_dir = debugfs_create_dir(dev_name(pdev), blk_debugfs_root);
dev_set_uevent_suppress(pdev, 0);
if (flags & ADDPART_FLAG_WHOLEDISK) {
err = device_create_file(pdev, &dev_attr_whole_disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 20e378b428b8..737467c29a31 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -574,8 +574,8 @@ struct request_queue {
struct list_head tag_set_list;
struct bio_set bio_split;

-#ifdef CONFIG_BLK_DEBUG_FS
struct dentry *debugfs_dir;
+#ifdef CONFIG_BLK_DEBUG_FS
struct dentry *sched_debugfs_dir;
struct dentry *rqos_debugfs_dir;
#endif
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 3b6ff5902edc..eb6db276e293 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -22,7 +22,6 @@ struct blk_trace {
u64 end_lba;
u32 pid;
u32 dev;
- struct dentry *dir;
struct dentry *dropped_file;
struct dentry *msg_file;
struct list_head running_list;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a9384449465a..7ff4c4c06140 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -89,6 +89,7 @@ struct hd_struct {
int make_it_fail;
#endif
struct rcu_work rcu_work;
+ struct dentry *debugfs_dir;
};

/**
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ea47f2084087..8209d41dec18 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,7 +311,6 @@ static void blk_trace_free(struct blk_trace *bt)
debugfs_remove(bt->msg_file);
debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
- debugfs_remove(bt->dir);
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -482,9 +481,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;

- if (!blk_debugfs_root)
- return -ENOENT;
-
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';

@@ -494,6 +490,38 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');

+ /*
+ * We also have to use a partition directory if a partition is
+ * being worked on, even though the same request_queue is shared.
+ */
+ if (bdev && bdev != bdev->bd_contains)
+ dir = bdev->bd_part->debugfs_dir;
+ else {
+ /*
+ * For queues that do not have a gendisk attached to them, the
+ * debugfs directory will not have been created at setup time.
+ * Create it here lazily, it will only be removed when the
+ * queue is torn down.
+ */
+ if (!q->debugfs_dir) {
+ q->debugfs_dir =
+ debugfs_create_dir(buts->name,
+ blk_debugfs_root);
+ }
+ dir = q->debugfs_dir;
+ }
+
+ /*
+ * As blktrace relies on debugfs for its interface the debugfs directory
+ * is required, contrary to the usual mantra of not checking for debugfs
+ * files or directories.
+ */
+ if (IS_ERR_OR_NULL(q->debugfs_dir)) {
+ pr_warn("debugfs_dir not present for %s so skipping\n",
+ buts->name);
+ return -ENOENT;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -507,12 +535,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->msg_data)
goto err;

- ret = -ENOENT;
-
- dir = debugfs_lookup(buts->name, blk_debugfs_root);
- if (!dir)
- bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
-
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
@@ -551,8 +573,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,

ret = 0;
err:
- if (dir && !bt->dir)
- dput(dir);
if (ret)
blk_trace_free(bt);
return ret;
--
2.26.2