[PATCH] 3.13 series, sg: O_EXCL and other lock handling

From: Douglas Gilbert
Date: Sat Oct 26 2013 - 17:54:14 EST


The lk 3.12.0-rc series contained four patches against the
sg driver from Vaughan Cao introduced by this post:
[PATCH v7 0/4][SCSI] sg: fix race condition in sg_open
http://marc.info/?l=linux-scsi&m=137774159020002&w=2

Doubt was thrown on the implementation by Madper Xie in
this thread:
[Bug] 12.864681 BUG: lock held when returning to user space!
http://marc.info/?l=linux-kernel&m=138121455020314&w=2

There was a second bug report. A patch from me (that needed to
be revised) was judged to be too late in the lk 3.12 rc cycle
to be accepted. Hence Vaughan's patch is queued to be reverted.
That means the sg driver will remain unchanged for at least
three releases: 3.10, 3.11 and 3.12

The patch presented here targets the lk 3.13 merge window
and is functionally equivalent to Vaughan's four part patch
with my revised patch on top.

Testing:
In the sg3_utils package (version 1.37) examples directory
there are the sg_tst_excl2 and sg_tst_excl3 ** programs.
They use multiple threads to bombard the sg driver (or bsg
or the block layer) with open calls varying the O_EXCL and
O_NONBLOCK flags. Writer threads check for an even number
then do a double increment on a given LBA; reader threads
just read that LBA and check it is even. If the integer
(4 bytes at the beginning of the LB) starts even, then it
should remain even, if O_EXCL is doing its job properly.

** sg_tst_excl3 came after version 1.37 was released and is
in the News section here: http://sg.danny.cz/sg

ChangeLog:
- introduce a finer grain (per device) lock to protect
access and changes to the file descriptor objects
- introduce a semaphore for mutual exclusion of co-incident
open and release calls to the same device
- improve the O_EXCL handling of sg_open() when multiple
callers are waiting for an O_EXCL condition to clear
- change some seq_printf()s to seq_puts()s as requested
by checkpatch.pl
- update copyright notice, version number and date


Signed-off-by: Douglas Gilbert <dgilbert@xxxxxxxxxxxx>


diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index df5e961..90e4631 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -7,9 +7,7 @@
* Original driver (sg.c):
* Copyright (C) 1992 Lawrence Foard
* Version 2 and 3 extensions to driver:
- * Copyright (C) 1998 - 2005 Douglas Gilbert
- *
- * Modified 19-JAN-1998 Richard Gooch <rgooch@xxxxxxxxxxxxx> Devfs support
+ * Copyright (C) 1998 - 2013 Douglas Gilbert
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,8 +16,8 @@
*
*/

-static int sg_version_num = 30534; /* 2 digits for each component */
-#define SG_VERSION_STR "3.5.34"
+static int sg_version_num = 30535; /* 2 digits for each component */
+#define SG_VERSION_STR "3.5.35"

/*
* D. P. Gilbert (dgilbert@xxxxxxxxxxxx, dougg@xxxxxxxxxxxxx), notes:
@@ -64,7 +62,7 @@ static int sg_version_num = 30534; /* 2 digits for each component */

#ifdef CONFIG_SCSI_PROC_FS
#include <linux/proc_fs.h>
-static char *sg_version_date = "20061027";
+static char *sg_version_date = "20131026";

static int sg_proc_init(void);
static void sg_proc_cleanup(void);
@@ -105,11 +103,8 @@ static int scatter_elem_sz_prev = SG_SCATTER_SZ;
static int sg_add(struct device *, struct class_interface *);
static void sg_remove(struct device *, struct class_interface *);

-static DEFINE_SPINLOCK(sg_open_exclusive_lock);
-
static DEFINE_IDR(sg_index_idr);
-static DEFINE_RWLOCK(sg_index_lock); /* Also used to lock
- file descriptor list for device */
+static DEFINE_RWLOCK(sg_index_lock);

static struct class_interface sg_interface = {
.add_dev = sg_add,
@@ -146,8 +141,7 @@ typedef struct sg_request { /* SG_MAX_QUEUE requests outstanding per file */
} Sg_request;

typedef struct sg_fd { /* holds the state of a file descriptor */
- /* sfd_siblings is protected by sg_index_lock */
- struct list_head sfd_siblings;
+ struct list_head sfd_siblings; /* protected by sfd_lock of device */
struct sg_device *parentdp; /* owning device */
wait_queue_head_t read_wait; /* queue read until command done */
rwlock_t rq_list_lock; /* protect access to list in req_arr */
@@ -170,14 +164,14 @@ typedef struct sg_fd { /* holds the state of a file descriptor */

typedef struct sg_device { /* holds the state of each scsi generic device */
struct scsi_device *device;
- wait_queue_head_t o_excl_wait; /* queue open() when O_EXCL in use */
int sg_tablesize; /* adapter's max scatter-gather table size */
u32 index; /* device index number */
- /* sfds is protected by sg_index_lock */
+ spinlock_t sfd_lock; /* protect file descriptor list for device */
struct list_head sfds;
+ struct semaphore or_sem; /* protect co-incidental opens and releases */
+ wait_queue_head_t open_wait; /* waits associated with O_EXCL */
volatile char detached; /* 0->attached, 1->detached pending removal */
- /* exclude protected by sg_open_exclusive_lock */
- char exclude; /* opened for exclusive access */
+ char exclude; /* open(O_EXCL) on this device has succeeded */
char sgdebug; /* 0->off, 1->sense, 9->dump dev, 10-> all devs */
struct gendisk *disk;
struct cdev * cdev; /* char_dev [sysfs: /sys/cdev/major/sg<n>] */
@@ -225,38 +219,28 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd)
return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE);
}

-static int get_exclude(Sg_device *sdp)
+static int sfds_list_empty(Sg_device *sdp)
{
unsigned long flags;
int ret;

- spin_lock_irqsave(&sg_open_exclusive_lock, flags);
- ret = sdp->exclude;
- spin_unlock_irqrestore(&sg_open_exclusive_lock, flags);
+ spin_lock_irqsave(&sdp->sfd_lock, flags);
+ ret = list_empty(&sdp->sfds);
+ spin_unlock_irqrestore(&sdp->sfd_lock, flags);
return ret;
}

-static int set_exclude(Sg_device *sdp, char val)
+static int
+open_wait_event(Sg_device *sdp, int excl_case)
{
- unsigned long flags;
+ int retval;

- spin_lock_irqsave(&sg_open_exclusive_lock, flags);
- sdp->exclude = val;
- spin_unlock_irqrestore(&sg_open_exclusive_lock, flags);
- return val;
-}
-
-static int sfds_list_empty(Sg_device *sdp)
-{
- unsigned long flags;
- int ret;
-
- read_lock_irqsave(&sg_index_lock, flags);
- ret = list_empty(&sdp->sfds);
- read_unlock_irqrestore(&sg_index_lock, flags);
- return ret;
+ retval = wait_event_interruptible(sdp->open_wait, (sdp->detached ||
+ (excl_case ? (!sdp->exclude) : sfds_list_empty(sdp))));
+ return sdp->detached ? -ENODEV : retval;
}

+/* Returns 0 on success, else a negated errno value */
static int
sg_open(struct inode *inode, struct file *filp)
{
@@ -265,8 +249,7 @@ sg_open(struct inode *inode, struct file *filp)
struct request_queue *q;
Sg_device *sdp;
Sg_fd *sfp;
- int res;
- int retval;
+ int alone, retval;

nonseekable_open(inode, filp);
SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
@@ -287,61 +270,77 @@ sg_open(struct inode *inode, struct file *filp)
if (retval)
goto sdp_put;

+ /* scsi_block_when_processing_errors() may block so bypass
+ * check if O_NONBLOCK. Permits SCSI commands to be issued
+ * during error recovery. Tread carefully. */
if (!((flags & O_NONBLOCK) ||
scsi_block_when_processing_errors(sdp->device))) {
retval = -ENXIO;
/* we are in error recovery for this device */
- goto error_out;
+ goto error_after_sem;
}

- if (flags & O_EXCL) {
- if (O_RDONLY == (flags & O_ACCMODE)) {
- retval = -EPERM; /* Can't lock it with read only access */
- goto error_out;
- }
- if (!sfds_list_empty(sdp) && (flags & O_NONBLOCK)) {
- retval = -EBUSY;
- goto error_out;
- }
- res = wait_event_interruptible(sdp->o_excl_wait,
- ((!sfds_list_empty(sdp) || get_exclude(sdp)) ? 0 : set_exclude(sdp, 1)));
- if (res) {
- retval = res; /* -ERESTARTSYS because signal hit process */
- goto error_out;
- }
- } else if (get_exclude(sdp)) { /* some other fd has an exclusive lock on dev */
- if (flags & O_NONBLOCK) {
- retval = -EBUSY;
- goto error_out;
+ down(&sdp->or_sem);
+ alone = sfds_list_empty(sdp);
+ if ((flags & O_EXCL) && (O_RDONLY == (flags & O_ACCMODE))) {
+ retval = -EPERM; /* Don't allow O_EXCL with read only access */
+ goto error_out;
+ }
+ if (flags & O_NONBLOCK) {
+ if (flags & O_EXCL) {
+ if (!alone) {
+ retval = -EBUSY;
+ goto error_out;
+ }
+ } else {
+ if (sdp->exclude) {
+ retval = -EBUSY;
+ goto error_out;
+ }
}
- res = wait_event_interruptible(sdp->o_excl_wait, !get_exclude(sdp));
- if (res) {
- retval = res; /* -ERESTARTSYS because signal hit process */
- goto error_out;
+ } else {
+ if (flags & O_EXCL) {
+ while (!alone) {
+ up(&sdp->or_sem);
+ retval = open_wait_event(sdp, 0);
+ if (retval) /* -ERESTARTSYS or -ENODEV */
+ goto error_after_sem;
+ down(&sdp->or_sem);
+ alone = sfds_list_empty(sdp);
+ }
+ } else {
+ while (sdp->exclude) {
+ up(&sdp->or_sem);
+ retval = open_wait_event(sdp, 1);
+ if (retval) /* -ERESTARTSYS or -ENODEV */
+ goto error_after_sem;
+ down(&sdp->or_sem);
+ alone = sfds_list_empty(sdp);
+ }
}
}
- if (sdp->detached) {
- retval = -ENODEV;
- goto error_out;
- }
- if (sfds_list_empty(sdp)) { /* no existing opens on this device */
+ if (flags & O_EXCL)
+ sdp->exclude = 1;
+
+ if (alone) { /* no existing opens on this device */
sdp->sgdebug = 0;
q = sdp->device->request_queue;
sdp->sg_tablesize = queue_max_segments(q);
}
- if ((sfp = sg_add_sfp(sdp, dev)))
+ sfp = sg_add_sfp(sdp, dev);
+ if (!IS_ERR(sfp)) {
filp->private_data = sfp;
- else {
+ retval = 0;
+ up(&sdp->or_sem);
+ } else {
+ retval = PTR_ERR(sfp);
if (flags & O_EXCL) {
- set_exclude(sdp, 0); /* undo if error */
- wake_up_interruptible(&sdp->o_excl_wait);
+ sdp->exclude = 0; /* undo if error */
+ wake_up_interruptible(&sdp->open_wait);
}
- retval = -ENOMEM;
- goto error_out;
- }
- retval = 0;
error_out:
- if (retval) {
+ up(&sdp->or_sem);
+error_after_sem:
scsi_autopm_put_device(sdp->device);
sdp_put:
scsi_device_put(sdp->device);
@@ -352,22 +351,29 @@ sg_put:
return retval;
}

-/* Following function was formerly called 'sg_close' */
+/* Release resources associated with a successful sg_open()
+ * Returns 0 on success, else a negated errno value */
static int
sg_release(struct inode *inode, struct file *filp)
{
Sg_device *sdp;
Sg_fd *sfp;
+ int excl;

if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp)))
return -ENXIO;
+ down(&sdp->or_sem);
SCSI_LOG_TIMEOUT(3, printk("sg_release: %s\n", sdp->disk->disk_name));

- set_exclude(sdp, 0);
- wake_up_interruptible(&sdp->o_excl_wait);
+ excl = sdp->exclude;
+ if (excl)
+ sdp->exclude = 0;

scsi_autopm_put_device(sdp->device);
kref_put(&sfp->f_ref, sg_remove_sfp);
+ if (excl || sfds_list_empty(sdp))
+ wake_up_interruptible(&sdp->open_wait);
+ up(&sdp->or_sem);
return 0;
}

@@ -1415,8 +1421,10 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp)
disk->first_minor = k;
sdp->disk = disk;
sdp->device = scsidp;
+ spin_lock_init(&sdp->sfd_lock);
INIT_LIST_HEAD(&sdp->sfds);
- init_waitqueue_head(&sdp->o_excl_wait);
+ sema_init(&sdp->or_sem, 1);
+ init_waitqueue_head(&sdp->open_wait);
sdp->sg_tablesize = queue_max_segments(q);
sdp->index = k;
kref_init(&sdp->d_ref);
@@ -1549,11 +1557,13 @@ static void sg_remove(struct device *cl_dev, struct class_interface *cl_intf)

/* Need a write lock to set sdp->detached. */
write_lock_irqsave(&sg_index_lock, iflags);
+ spin_lock(&sdp->sfd_lock);
sdp->detached = 1;
list_for_each_entry(sfp, &sdp->sfds, sfd_siblings) {
wake_up_interruptible(&sfp->read_wait);
kill_fasync(&sfp->async_qp, SIGPOLL, POLL_HUP);
}
+ spin_unlock(&sdp->sfd_lock);
write_unlock_irqrestore(&sg_index_lock, iflags);

sysfs_remove_link(&scsidp->sdev_gendev.kobj, "generic");
@@ -2064,7 +2074,7 @@ sg_add_sfp(Sg_device * sdp, int dev)

sfp = kzalloc(sizeof(*sfp), GFP_ATOMIC | __GFP_NOWARN);
if (!sfp)
- return NULL;
+ return ERR_PTR(-ENOMEM);

init_waitqueue_head(&sfp->read_wait);
rwlock_init(&sfp->rq_list_lock);
@@ -2078,9 +2088,13 @@ sg_add_sfp(Sg_device * sdp, int dev)
sfp->cmd_q = SG_DEF_COMMAND_Q;
sfp->keep_orphan = SG_DEF_KEEP_ORPHAN;
sfp->parentdp = sdp;
- write_lock_irqsave(&sg_index_lock, iflags);
+ spin_lock_irqsave(&sdp->sfd_lock, iflags);
+ if (sdp->detached) {
+ spin_unlock_irqrestore(&sdp->sfd_lock, iflags);
+ return ERR_PTR(-ENODEV);
+ }
list_add_tail(&sfp->sfd_siblings, &sdp->sfds);
- write_unlock_irqrestore(&sg_index_lock, iflags);
+ spin_unlock_irqrestore(&sdp->sfd_lock, iflags);
SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp: sfp=0x%p\n", sfp));
if (unlikely(sg_big_buff != def_reserved_size))
sg_big_buff = def_reserved_size;
@@ -2130,10 +2144,9 @@ static void sg_remove_sfp(struct kref *kref)
struct sg_device *sdp = sfp->parentdp;
unsigned long iflags;

- write_lock_irqsave(&sg_index_lock, iflags);
+ spin_lock_irqsave(&sdp->sfd_lock, iflags);
list_del(&sfp->sfd_siblings);
- write_unlock_irqrestore(&sg_index_lock, iflags);
- wake_up_interruptible(&sdp->o_excl_wait);
+ spin_unlock_irqrestore(&sdp->sfd_lock, iflags);

INIT_WORK(&sfp->ew.work, sg_remove_sfp_usercontext);
schedule_work(&sfp->ew.work);
@@ -2426,8 +2439,7 @@ static int sg_proc_single_open_version(struct inode *inode, struct file *file)

static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v)
{
- seq_printf(s, "host\tchan\tid\tlun\ttype\topens\tqdepth\tbusy\t"
- "online\n");
+ seq_puts(s, "host\tchan\tid\tlun\ttype\topens\tqdepth\tbusy\tonline\n");
return 0;
}

@@ -2492,7 +2504,7 @@ static int sg_proc_seq_show_dev(struct seq_file *s, void *v)
(int) scsidp->device_busy,
(int) scsi_device_online(scsidp));
else
- seq_printf(s, "-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\n");
+ seq_puts(s, "-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\n");
read_unlock_irqrestore(&sg_index_lock, iflags);
return 0;
}
@@ -2515,12 +2527,12 @@ static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v)
seq_printf(s, "%8.8s\t%16.16s\t%4.4s\n",
scsidp->vendor, scsidp->model, scsidp->rev);
else
- seq_printf(s, "<no active device>\n");
+ seq_puts(s, "<no active device>\n");
read_unlock_irqrestore(&sg_index_lock, iflags);
return 0;
}

-/* must be called while holding sg_index_lock */
+/* must be called while holding sg_index_lock and sfd_lock */
static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp)
{
int k, m, new_interface, blen, usg;
@@ -2560,12 +2572,12 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp)
else
cp = " ";
}
- seq_printf(s, cp);
+ seq_puts(s, cp);
blen = srp->data.bufflen;
usg = srp->data.k_use_sg;
- seq_printf(s, srp->done ?
- ((1 == srp->done) ? "rcv:" : "fin:")
- : "act:");
+ seq_puts(s, srp->done ?
+ ((1 == srp->done) ? "rcv:" : "fin:")
+ : "act:");
seq_printf(s, " id=%d blen=%d",
srp->header.pack_id, blen);
if (srp->done)
@@ -2581,7 +2593,7 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp)
(int) srp->data.cmd_opcode);
}
if (0 == m)
- seq_printf(s, " No requests active\n");
+ seq_puts(s, " No requests active\n");
read_unlock(&fp->rq_list_lock);
}
}
@@ -2605,22 +2617,26 @@ static int sg_proc_seq_show_debug(struct seq_file *s, void *v)

read_lock_irqsave(&sg_index_lock, iflags);
sdp = it ? sg_lookup_dev(it->index) : NULL;
- if (sdp && !list_empty(&sdp->sfds)) {
- struct scsi_device *scsidp = sdp->device;
+ if (sdp) {
+ spin_lock(&sdp->sfd_lock);
+ if (!list_empty(&sdp->sfds)) {
+ struct scsi_device *scsidp = sdp->device;

- seq_printf(s, " >>> device=%s ", sdp->disk->disk_name);
- if (sdp->detached)
- seq_printf(s, "detached pending close ");
- else
- seq_printf
- (s, "scsi%d chan=%d id=%d lun=%d em=%d",
- scsidp->host->host_no,
- scsidp->channel, scsidp->id,
- scsidp->lun,
- scsidp->host->hostt->emulated);
- seq_printf(s, " sg_tablesize=%d excl=%d\n",
- sdp->sg_tablesize, get_exclude(sdp));
- sg_proc_debug_helper(s, sdp);
+ seq_printf(s, " >>> device=%s ", sdp->disk->disk_name);
+ if (sdp->detached)
+ seq_puts(s, "detached pending close ");
+ else
+ seq_printf
+ (s, "scsi%d chan=%d id=%d lun=%d em=%d",
+ scsidp->host->host_no,
+ scsidp->channel, scsidp->id,
+ scsidp->lun,
+ scsidp->host->hostt->emulated);
+ seq_printf(s, " sg_tablesize=%d excl=%d\n",
+ sdp->sg_tablesize, sdp->exclude);
+ sg_proc_debug_helper(s, sdp);
+ }
+ spin_unlock(&sdp->sfd_lock);
}
read_unlock_irqrestore(&sg_index_lock, iflags);
return 0;