[PATCH] w1_therm, reduce race conditions in w1_slave_show

From: David Fries
Date: Sat Mar 07 2015 - 23:25:37 EST


I tested this patch by running commands such as the following in one
process,

slave=28-000002c95fb1
while true; do echo $slave > /sys/devices/w1_bus_master1/w1_master_add; sleep .1; echo $slave > /sys/devices/w1_bus_master1/w1_master_remove; sleep .1; done

and then two at the same time in two other processes,
slave=28-000002c95fb1
while true; do time cat /sys/devices/w1_bus_master1/$slave/w1_slave ; sleep .1; done

then randomly stop all three and repeat.

With this patch I no longer see crashes, but at best this patch is
effectively hiding the result of a race condition. sl->family_data is
freed and set to NULL in the slave removal while w1_slave_show is
still dereferencing it. This patch holds on to the pointer, meaning
it's probably clobbering memory now instead of crashing. I
wonder if that would make RCU a fit for this? The original bug
report pointed to the problem being the unlocking of bus_mutex while
waiting for the temperature conversion, but I was getting
sl->family_data set to NULL more reliably without external power, which
means bus_mutex was held for the duration of w1_slave_show. That is not
to say that the original bug report wasn't correct; it is to say that
even with the spinlock, holding bus_mutex on the slave isn't sufficient
to keep the slave from being removed.

Reported-By: Thorsten Bschorr <thorsten@xxxxxxxxxx>
---
drivers/w1/slaves/w1_therm.c | 70 +++++++++++++++++++++++++++++++++---------
1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c
index 1f11a20..403285d 100644
--- a/drivers/w1/slaves/w1_therm.c
+++ b/drivers/w1/slaves/w1_therm.c
@@ -59,16 +59,32 @@ MODULE_ALIAS("w1-family-" __stringify(W1_THERM_DS28EA00));
static int w1_strong_pullup = 1;
module_param_named(strong_pullup, w1_strong_pullup, int, 0);

+struct w1_therm_family_data {
+ uint8_t rom[9];
+ atomic_t refcnt;
+};
+
+/* return the address of the refcnt in the family data */
+#define THERM_REFCNT(family_data) \
+ (&((struct w1_therm_family_data*)family_data)->refcnt)
+
static int w1_therm_add_slave(struct w1_slave *sl)
{
- sl->family_data = kzalloc(9, GFP_KERNEL);
+ sl->family_data = kzalloc(sizeof(struct w1_therm_family_data),
+ GFP_KERNEL);
if (!sl->family_data)
return -ENOMEM;
+ atomic_set(THERM_REFCNT(sl->family_data), 1);
return 0;
}

static void w1_therm_remove_slave(struct w1_slave *sl)
{
+ int refcnt = atomic_sub_return(1, THERM_REFCNT(sl->family_data));
+ while(refcnt) {
+ msleep(1000);
+ refcnt = atomic_read(THERM_REFCNT(sl->family_data));
+ }
kfree(sl->family_data);
sl->family_data = NULL;
}
@@ -194,13 +210,30 @@ static ssize_t w1_slave_show(struct device *device,
struct w1_slave *sl = dev_to_w1_slave(device);
struct w1_master *dev = sl->master;
u8 rom[9], crc, verdict, external_power;
- int i, max_trying = 10;
+ int i, ret, max_trying = 10;
ssize_t c = PAGE_SIZE;
+ u8 *family_data = sl->family_data;
+
+ ret = mutex_lock_interruptible(&dev->bus_mutex);
+ if (ret != 0)
+ goto post_unlock;

- i = mutex_lock_interruptible(&dev->bus_mutex);
- if (i != 0)
- return i;
+ if(!sl->family_data)
+ {
+ ret = -ENODEV;
+ /* Note for anyone who actually saw this message, it is a known
+ * problem with either slave drivers or this driver in
+ * particular and the request is only a canary indication as
+ * to how many people and how often it is being run into.
+ */
+ printk(KERN_NOTICE
+ "%s: %u sl->family_data is NULL please report\n",
+ __FILE__, __LINE__);
+ goto pre_unlock;
+ }

+ /* prevent the slave from going away in sleep */
+ atomic_inc(THERM_REFCNT(family_data));
memset(rom, 0, sizeof(rom));

while (max_trying--) {
@@ -230,17 +263,19 @@ static ssize_t w1_slave_show(struct device *device,
mutex_unlock(&dev->bus_mutex);

sleep_rem = msleep_interruptible(tm);
- if (sleep_rem != 0)
- return -EINTR;
+ if (sleep_rem != 0) {
+ ret = -EINTR;
+ goto post_unlock;
+ }

- i = mutex_lock_interruptible(&dev->bus_mutex);
- if (i != 0)
- return i;
+ ret = mutex_lock_interruptible(&dev->bus_mutex);
+ if (ret != 0)
+ goto post_unlock;
} else if (!w1_strong_pullup) {
sleep_rem = msleep_interruptible(tm);
if (sleep_rem != 0) {
- mutex_unlock(&dev->bus_mutex);
- return -EINTR;
+ ret = -EINTR;
+ goto pre_unlock;
}
}

@@ -269,19 +304,24 @@ static ssize_t w1_slave_show(struct device *device,
c -= snprintf(buf + PAGE_SIZE - c, c, ": crc=%02x %s\n",
crc, (verdict) ? "YES" : "NO");
if (verdict)
- memcpy(sl->family_data, rom, sizeof(rom));
+ memcpy(family_data, rom, sizeof(rom));
else
dev_warn(device, "Read failed CRC check\n");

for (i = 0; i < 9; ++i)
c -= snprintf(buf + PAGE_SIZE - c, c, "%02x ",
- ((u8 *)sl->family_data)[i]);
+ ((u8 *)family_data)[i]);

c -= snprintf(buf + PAGE_SIZE - c, c, "t=%d\n",
w1_convert_temp(rom, sl->family->fid));
+ ret = PAGE_SIZE - c;
+
+pre_unlock:
mutex_unlock(&dev->bus_mutex);

- return PAGE_SIZE - c;
+post_unlock:
+ atomic_dec(THERM_REFCNT(family_data));
+ return ret;
}

static int __init w1_therm_init(void)
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/