[PATCH] [SCSI] sg: late O_EXCL fix for lk 3.12-rc

From: Douglas Gilbert
Date: Sun Oct 20 2013 - 12:09:39 EST


The lk 3.12.0-rc series contains a series of patches
from Vaughan Cao introduced by this post:
[PATCH v7 0/4][SCSI] sg: fix race condition in sg_open
http://marc.info/?l=linux-scsi&m=137774159020002&w=2

Doubt was thrown on the implementation by Madper Xie in
this thread:
[Bug] 12.864681 BUG: lock held when returning to user space!
http://marc.info/?l=linux-kernel&m=138121455020314&w=2

Well it is not a lock, it is a read-write semaphore down-ed
in sg_open() with the reasonable expectation that a
corresponding sg_release() will arrive thereupon that
read-write semaphore is upp-ed. I'm yet to find kernel
documentation that says that is illegal. There are even driver
examples on the 'net showing this same technique but that
doesn't prove it is correct.

Irrespective of 12.864681 while looking at this I noticed
another bug. The semantics of O_EXCL in the sg driver are
modelled on those of a regular file. That means an attempt
to do sg_open(O_EXCL) on a device with an existing open, or
a sg_open() on a device which already has a sg_open(O_EXCL)
active will wait the caller until "the coast is clear".
This assumes O_NONBLOCK is not given, if it is, EBUSY is sent
back. Now that wait needs to be interruptible and was
before Vaughan Cao's patch. In his defence there are no
down_read_interruptible() and down_write_interruptible()
calls available for this purpose.

So I propose the following patch over Vaughan Cao's patch.
It replaces his per-device read-write semaphore with:
- a normal semaphore (or_sem) to stop co-incident open()s
and release()s on the same device treading on the same
data.
- a wait_queue (open_wait) to handle wait on open() which is
associated with the use of O_EXCL (when O_NONBLOCK is not
given). The two wait_event_interruptible() calls are in a
loop because multiple open()s could be waiting on either
case. All get woken up but only one will get the down() at
the bottom of the loop and proceed to complete the open(),
potentially leaving the other candidates to continue waiting
until it releases.

Apart from the initializations, all my proposed changes are
in sg_open() and sg_release(). The examples directory in
my sg3_utils version 1.37 package:
http://sg.danny.cz/sg/sg3_utils.html
contains sg_tst_excl.cpp and sg_tst_excl2.cpp ** which are
designed to stress O_EXCL on the sg driver and friends (bsg
ignores O_EXCL, and the block layer has questionable O_EXCL
semantics). My patch has been successful with these tests and
others on my meagre hardware.


Given that lk 3.12.0 release is not far away, the safest path
may still be to revert Vaughan Cao's patch. I'll leave that
decision to the maintainers.

Doug Gilbert


** C++11 so gcc 4.8 (or late 4.7) needed

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 5cbc4bb..11c4e94 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -170,7 +170,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
u32 index; /* device index number */
spinlock_t sfd_lock; /* protect file descriptor list for device */
struct list_head sfds;
- struct rw_semaphore o_sem; /* exclude open should hold this rwsem */
+ struct semaphore or_sem; /* protect co-incidental opens and releases */
+ wait_queue_head_t open_wait; /* waits associated with O_EXCL */
volatile char detached; /* 0->attached, 1->detached pending removal */
char exclude; /* opened for exclusive access */
char sgdebug; /* 0->off, 1->sense, 9->dump dev, 10-> all devs */
@@ -232,6 +233,16 @@ static int sfds_list_empty(Sg_device *sdp)
}

static int
+open_wait_event(Sg_device *sdp, int excl_case)
+{
+ int retval;
+
+ retval = wait_event_interruptible(sdp->open_wait, (sdp->detached ||
+ (excl_case ? (! sdp->exclude) : sfds_list_empty(sdp))));
+ return sdp->detached ? -ENODEV : retval;
+}
+
+static int
sg_open(struct inode *inode, struct file *filp)
{
int dev = iminor(inode);
@@ -239,7 +250,7 @@ sg_open(struct inode *inode, struct file *filp)
struct request_queue *q;
Sg_device *sdp;
Sg_fd *sfp;
- int retval;
+ int alone, retval;

nonseekable_open(inode, filp);
SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
@@ -260,6 +271,8 @@ sg_open(struct inode *inode, struct file *filp)
if (retval)
goto sdp_put;

+ down(&sdp->or_sem);
+ alone = sfds_list_empty(sdp);
if (!((flags & O_NONBLOCK) ||
scsi_block_when_processing_errors(sdp->device))) {
retval = -ENXIO;
@@ -273,46 +286,63 @@ sg_open(struct inode *inode, struct file *filp)
}
if (flags & O_NONBLOCK) {
if (flags & O_EXCL) {
- if (!down_write_trylock(&sdp->o_sem)) {
+ if (! alone) {
retval = -EBUSY;
goto error_out;
- }
+ }
} else {
- if (!down_read_trylock(&sdp->o_sem)) {
+ if (sdp->exclude) {
retval = -EBUSY;
goto error_out;
}
}
} else {
- if (flags & O_EXCL)
- down_write(&sdp->o_sem);
- else
- down_read(&sdp->o_sem);
+ if (flags & O_EXCL) {
+ while (! alone) {
+ up(&sdp->or_sem);
+ retval = open_wait_event(sdp, 0);
+ if (retval) /* -ERESTARTSYS or -ENODEV */
+ goto error_upped;
+ down(&sdp->or_sem);
+ alone = sfds_list_empty(sdp);
+ }
+ } else {
+ while (sdp->exclude) {
+ up(&sdp->or_sem);
+ retval = open_wait_event(sdp, 1);
+ if (retval) /* -ERESTARTSYS or -ENODEV */
+ goto error_upped;
+ down(&sdp->or_sem);
+ alone = sfds_list_empty(sdp);
+ }
+ }
}
/* Since write lock is held, no need to check sfd_list */
if (flags & O_EXCL)
sdp->exclude = 1; /* used by release lock */

- if (sfds_list_empty(sdp)) { /* no existing opens on this device */
+ if (alone) { /* no existing opens on this device */
sdp->sgdebug = 0;
q = sdp->device->request_queue;
sdp->sg_tablesize = queue_max_segments(q);
}
sfp = sg_add_sfp(sdp, dev);
- if (!IS_ERR(sfp))
+ if (!IS_ERR(sfp)) {
filp->private_data = sfp;
/* retval is already provably zero at this point because of the
* check after retval = scsi_autopm_get_device(sdp->device))
*/
- else {
+ up(&sdp->or_sem);
+ } else {
retval = PTR_ERR(sfp);

if (flags & O_EXCL) {
sdp->exclude = 0; /* undo if error */
- up_write(&sdp->o_sem);
- } else
- up_read(&sdp->o_sem);
+ wake_up_interruptible(&sdp->open_wait);
+ }
error_out:
+ up(&sdp->or_sem);
+error_upped:
scsi_autopm_put_device(sdp->device);
sdp_put:
scsi_device_put(sdp->device);
@@ -333,17 +363,18 @@ sg_release(struct inode *inode, struct file *filp)

if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp)))
return -ENXIO;
+ down(&sdp->or_sem);
SCSI_LOG_TIMEOUT(3, printk("sg_release: %s\n", sdp->disk->disk_name));

excl = sdp->exclude;
- sdp->exclude = 0;
if (excl)
- up_write(&sdp->o_sem);
- else
- up_read(&sdp->o_sem);
+ sdp->exclude = 0;

scsi_autopm_put_device(sdp->device);
kref_put(&sfp->f_ref, sg_remove_sfp);
+ if (excl || sfds_list_empty(sdp))
+ wake_up_interruptible(&sdp->open_wait);
+ up(&sdp->or_sem);
return 0;
}

@@ -1393,7 +1424,8 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp)
sdp->device = scsidp;
spin_lock_init(&sdp->sfd_lock);
INIT_LIST_HEAD(&sdp->sfds);
- init_rwsem(&sdp->o_sem);
+ sema_init(&sdp->or_sem, 1);
+ init_waitqueue_head(&sdp->open_wait);
sdp->sg_tablesize = queue_max_segments(q);
sdp->index = k;
kref_init(&sdp->d_ref);