[RFC PATCH 1/7] block: Support creating a struct file from a block device

From: Demi Marie Obenour
Date: Wed Jan 25 2023 - 22:34:26 EST


The newly added blkdev_get_file() function allows kernel code to create
a struct file for any block device. The main use-case is for the
struct file to be exposed to userspace as a file descriptor. A future
patch will modify the DM_DEV_CREATE ioctl to allow userspace to
get a file descriptor to the newly created block device, avoiding nasty
race conditions.

Signed-off-by: Demi Marie Obenour <demi@xxxxxxxxxxxxxxxxxxxxxx>
---
block/bdev.c | 77 +++++++++++++++++++++++++++++++++++-------
include/linux/blkdev.h | 5 +++
2 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index edc110d90df4041e7d337976951bd0d17525f1f7..09cb5ef900ca9ad5b21250bb63e64cc2a79f9289 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -459,10 +459,33 @@ static struct file_system_type bd_type = {
struct super_block *blockdev_superblock __read_mostly;
EXPORT_SYMBOL_GPL(blockdev_superblock);

+static struct vfsmount *bd_mnt __read_mostly;
+
+struct file *
+blkdev_get_file(struct block_device *bdev, fmode_t flags, void *holder)
+{
+ struct inode *inode;
+ struct file *filp;
+ int ret;
+
+ ret = blkdev_do_open(bdev, flags, holder);
+ if (ret)
+ return ERR_PTR(ret);
+ inode = bdev->bd_inode;
+ filp = alloc_file_pseudo(inode, bd_mnt, "[block]", flags | O_CLOEXEC, &def_blk_fops);
+ if (IS_ERR(filp)) {
+ blkdev_put(bdev, flags);
+ } else {
+ filp->f_mapping = inode->i_mapping;
+ filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+ }
+ return filp;
+}
+EXPORT_SYMBOL(blkdev_get_file);
+
void __init bdev_cache_init(void)
{
int err;
- static struct vfsmount *bd_mnt;

bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -775,7 +798,7 @@ void blkdev_put_no_open(struct block_device *bdev)
*
* Use this interface ONLY if you really do not have anything better - i.e. when
* you are behind a truly sucky interface and all you are given is a device
- * number. Everything else should use blkdev_get_by_path().
+ * number. Everything else should use blkdev_get_by_path() or blkdev_do_open().
*
* CONTEXT:
* Might sleep.
@@ -785,9 +808,7 @@ void blkdev_put_no_open(struct block_device *bdev)
*/
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
- bool unblock_events = true;
struct block_device *bdev;
- struct gendisk *disk;
int ret;

ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
@@ -800,18 +821,52 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
bdev = blkdev_get_no_open(dev);
if (!bdev)
return ERR_PTR(-ENXIO);
- disk = bdev->bd_disk;
+
+ ret = blkdev_do_open(bdev, mode, holder);
+ if (ret) {
+ blkdev_put_no_open(bdev);
+ return ERR_PTR(ret);
+ }
+
+ return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
+
+/**
+ * blkdev_do_open - open a block device by device pointer
+ * @bdev: pointer to the device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the block device pointed to by @bdev. If @mode includes
+ * %FMODE_EXCL, the block device is opened with exclusive access. Specifying
+ * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for
+ * the same @holder.
+ *
+ * Unlike blkdev_get_by_dev() and blkdev_get_by_path(), this function does not
+ * do any permission checks. The most common use-case is where the device
+ * was freshly created by userspace.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkdev_do_open(struct block_device *bdev, fmode_t mode, void *holder) {
+ struct gendisk *disk = bdev->bd_disk;
+ int ret = -ENXIO;
+ bool unblock_events = true;

if (mode & FMODE_EXCL) {
ret = bd_prepare_to_claim(bdev, holder);
if (ret)
- goto put_blkdev;
+ return ret;
}

disk_block_events(disk);

mutex_lock(&disk->open_mutex);
- ret = -ENXIO;
if (!disk_live(disk))
goto abort_claiming;
if (!try_module_get(disk->fops->owner))
@@ -842,7 +897,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)

if (unblock_events)
disk_unblock_events(disk);
- return bdev;
+ return 0;
put_module:
module_put(disk->fops->owner);
abort_claiming:
@@ -850,11 +905,9 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
bd_abort_claiming(bdev, holder);
mutex_unlock(&disk->open_mutex);
disk_unblock_events(disk);
-put_blkdev:
- blkdev_put_no_open(bdev);
- return ERR_PTR(ret);
+ return ret;
}
-EXPORT_SYMBOL(blkdev_get_by_dev);
+EXPORT_SYMBOL(blkdev_do_open);

/**
* blkdev_get_by_path - open a block device by name
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 43d4e073b1115e4628a001081fbf08b296d342df..04635cb5ee29d22394a34c65eb34bea4e7847d8d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -325,6 +325,11 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,

void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);

+struct file *
+blkdev_get_file(struct block_device *bdev, fmode_t flags, void *holder);
+
+int blkdev_do_open(struct block_device *bdev, fmode_t flags, void *holder);
+
#ifdef CONFIG_BLK_DEV_ZONED

#define BLK_ALL_ZONES ((unsigned int)-1)
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab