[PATCH v2] sd: Add timeout_sec and max_retries module parameter for sd

From: Masami Hiramatsu (Google)
Date: Tue Jun 03 2025 - 01:40:52 EST


From: Masami Hiramatsu (Google) <mhiramat@xxxxxxxxxx>

Sometimes a USB storage connection is unstable and the probing
takes longer time than the hung check timeout. Since the probing
runs under device_lock(dev), if there is another task tries to
acquire the same device_lock() (e.g. udevd, in this case), that
task hits the hung_task error and will lead a kernel panic.

For example, enabling CONFIG_DETECT_HUNG_TASK_BLOCKER, I got an
error message something like below (Note that this is 6.1 kernel
example, so the function names are a bit different.);

INFO: task udevd:5301 blocked for more than 122 seconds.
...
INFO: task udevd:5301 is blocked on a mutex likely owned by task kworker/u4:1:11.
task:kworker/u4:1state:D stack:0 pid:11ppid:2 flags:0x00004000
Workqueue: events_unbound async_run_entry_fn
Call Trace:
<TASK>
schedule+0x438/0x1490
? blk_mq_do_dispatch_ctx+0x70/0x1c0
schedule_timeout+0x253/0x790
? try_to_del_timer_sync+0xb0/0xb0
io_schedule_timeout+0x3f/0x80
wait_for_common_io+0xb4/0x160
blk_execute_rq+0x1bd/0x210
__scsi_execute+0x156/0x240
sd_revalidate_disk+0xa2a/0x2360
? kobject_uevent_env+0x158/0x430
sd_probe+0x364/0x47
really_probe+0x15a/0x3b0
__driver_probe_device+0x78/0xc0
driver_probe_device+0x24/0x1a0
__device_attach_driver+0x131/0x160
? coredump_store+0x50/0x50
bus_for_each_drv+0x9d/0xf0
__device_attach_async_helper+0x7e/0xd0 <=== device_lock()
...

In this case, device_lock() was locked in
__device_attach_async_helper(), and it ran driver_probe_device()
for each driver, and eventually send a scsi command which took
very long time.

This is because we use a long timeout and retries for sd_probe().
To avoid it, makes the default timeout and max retries tunable.
Since the sd.ko can be loaded right before the broken device is
probed, pass the default value as module parameters, so that
user can set it via modules.conf.

If we set these values 10 times smaller (e.g. timeout_sec=3),
sd_probe can detect wrong devices/connection before causing
hung_task error.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@xxxxxxxxxx>
---
Changes in v2:
- Add `write_same_timeout_sec` parameter.
---
drivers/scsi/sd.c | 57 +++++++++++++++++++++++++++++++++--------------------
1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 950d8c9fb884..a4ef210aaa80 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -100,6 +100,21 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC);

+/* timeout_sec defines the default value of the SCSI command timeout in second. */
+static int sd_timeout_sec = SD_TIMEOUT / HZ;
+module_param_named(timeout_sec, sd_timeout_sec, int, 0644);
+
+/*
+ * write_same_timeout_sec defines the default value of the WRITE SAME SCSI
+ * command timeout in second.
+ */
+static int sd_write_same_timeout_sec = SD_WRITE_SAME_TIMEOUT / HZ;
+module_param_named(write_same_timeout_sec, sd_write_same_timeout_sec, int, 0644);
+
+/* max_retries defines the default value of the max of SCSI command retries.*/
+static int sd_max_retries = SD_MAX_RETRIES;
+module_param_named(max_retries, sd_max_retries, int, 0644);
+
#define SD_MINORS 16

static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim,
@@ -184,7 +199,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr,
return count;
}

- if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), SD_TIMEOUT,
+ if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), sd_timeout_sec * HZ,
sdkp->max_retries, &data, NULL))
return -EINVAL;
len = min_t(size_t, sizeof(buffer), data.length - data.header_length -
@@ -202,7 +217,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr,
*/
data.device_specific = 0;

- ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT,
+ ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, sd_timeout_sec * HZ,
sdkp->max_retries, &data, &sshdr);
if (ret) {
if (ret > 0 && scsi_sense_valid(&sshdr))
@@ -729,7 +744,7 @@ static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer,
put_unaligned_be32(len, &cdb[6]);

ret = scsi_execute_cmd(sdev, cdb, send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
- buffer, len, SD_TIMEOUT, sdkp->max_retries,
+ buffer, len, sd_timeout_sec * HZ, sdkp->max_retries,
&exec_args);
return ret <= 0 ? ret : -EIO;
}
@@ -930,7 +945,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)

cmd->allowed = sdkp->max_retries;
cmd->transfersize = data_len;
- rq->timeout = SD_TIMEOUT;
+ rq->timeout = sd_timeout_sec * HZ;

return scsi_alloc_sgtables(cmd);
}
@@ -1016,7 +1031,7 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd,

cmd->allowed = sdkp->max_retries;
cmd->transfersize = data_len;
- rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+ rq->timeout = unmap ? sd_timeout_sec * HZ : sd_write_same_timeout_sec * HZ;

return scsi_alloc_sgtables(cmd);
}
@@ -1043,7 +1058,7 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd,

cmd->allowed = sdkp->max_retries;
cmd->transfersize = data_len;
- rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+ rq->timeout = unmap ? sd_timeout_sec * HZ : sd_write_same_timeout_sec * HZ;

return scsi_alloc_sgtables(cmd);
}
@@ -1739,7 +1754,7 @@ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
if (scsi_block_when_processing_errors(sdp)) {
struct scsi_sense_hdr sshdr = { 0, };

- retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, sdkp->max_retries,
+ retval = scsi_test_unit_ready(sdp, sd_timeout_sec * HZ, sdkp->max_retries,
&sshdr);

/* failed to execute TUR, assume media not present */
@@ -1952,7 +1967,7 @@ static int sd_pr_in_command(struct block_device *bdev, u8 sa,
put_unaligned_be16(data_len, &cmd[7]);

result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, data, data_len,
- SD_TIMEOUT, sdkp->max_retries, &exec_args);
+ sd_timeout_sec * HZ, sdkp->max_retries, &exec_args);
if (scsi_status_is_check_condition(result) &&
scsi_sense_valid(&sshdr)) {
sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result);
@@ -2063,7 +2078,7 @@ static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key,
data[20] = flags;

result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, &data,
- sizeof(data), SD_TIMEOUT, sdkp->max_retries,
+ sizeof(data), sd_timeout_sec * HZ, sdkp->max_retries,
&exec_args);

if (scsi_status_is_check_condition(result) &&
@@ -2435,7 +2450,7 @@ sd_spinup_disk(struct scsi_disk *sdkp)
scsi_failures_reset_retries(&failures);

the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN,
- NULL, 0, SD_TIMEOUT,
+ NULL, 0, sd_timeout_sec * HZ,
sdkp->max_retries, &exec_args);


@@ -2498,7 +2513,7 @@ sd_spinup_disk(struct scsi_disk *sdkp)
sd_printk(KERN_NOTICE, sdkp, "Spinning up disk...");
scsi_execute_cmd(sdkp->device, start_cmd,
REQ_OP_DRV_IN, NULL, 0,
- SD_TIMEOUT, sdkp->max_retries,
+ sd_timeout_sec * HZ, sdkp->max_retries,
&exec_args);
spintime_expire = jiffies + 100 * HZ;
spintime = 1;
@@ -2649,7 +2664,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
memset(buffer, 0, RC16_LEN);

the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN,
- buffer, RC16_LEN, SD_TIMEOUT,
+ buffer, RC16_LEN, sd_timeout_sec * HZ,
sdkp->max_retries, &exec_args);
if (the_result > 0) {
if (media_not_present(sdkp, &sshdr))
@@ -2760,7 +2775,7 @@ static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp,
memset(buffer, 0, 8);

the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer,
- 8, SD_TIMEOUT, sdkp->max_retries,
+ 8, sd_timeout_sec * HZ, sdkp->max_retries,
&exec_args);

if (the_result > 0) {
@@ -2948,7 +2963,7 @@ sd_do_mode_sense(struct scsi_disk *sdkp, int dbd, int modepage,
len = 8;

return scsi_mode_sense(sdkp->device, dbd, modepage, 0, buffer, len,
- SD_TIMEOUT, sdkp->max_retries, data, sshdr);
+ sd_timeout_sec * HZ, sdkp->max_retries, data, sshdr);
}

/*
@@ -3206,7 +3221,7 @@ static bool sd_is_perm_stream(struct scsi_disk *sdkp, unsigned int stream_id)
put_unaligned_be32(sizeof(buf), &cdb[10]);

res = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, &buf, sizeof(buf),
- SD_TIMEOUT, sdkp->max_retries, &exec_args);
+ sd_timeout_sec * HZ, sdkp->max_retries, &exec_args);
if (res < 0)
return false;
if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr))
@@ -3231,7 +3246,7 @@ static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer)
return;

res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a,
- /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT,
+ /*subpage=*/0x05, buffer, SD_BUF_SIZE, sd_timeout_sec * HZ,
sdkp->max_retries, &data, &sshdr);
if (res < 0)
return;
@@ -3274,7 +3289,7 @@ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer)
if (sdkp->protection_type == 0)
return;

- res = scsi_mode_sense(sdp, 1, 0x0a, 0, buffer, 36, SD_TIMEOUT,
+ res = scsi_mode_sense(sdp, 1, 0x0a, 0, buffer, 36, sd_timeout_sec * HZ,
sdkp->max_retries, &data, &sshdr);

if (res < 0 || !data.header_length ||
@@ -3682,7 +3697,7 @@ static void sd_read_block_zero(struct scsi_disk *sdkp)
}

scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len,
- SD_TIMEOUT, sdkp->max_retries, NULL);
+ sd_timeout_sec * HZ, sdkp->max_retries, NULL);
kfree(buffer);
}

@@ -3957,13 +3972,13 @@ static int sd_probe(struct device *dev)
sdkp->device = sdp;
sdkp->disk = gd;
sdkp->index = index;
- sdkp->max_retries = SD_MAX_RETRIES;
+ sdkp->max_retries = sd_max_retries;
atomic_set(&sdkp->openers, 0);
atomic_set(&sdkp->device->ioerr_cnt, 0);

if (!sdp->request_queue->rq_timeout) {
if (sdp->type != TYPE_MOD)
- blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
+ blk_queue_rq_timeout(sdp->request_queue, sd_timeout_sec * HZ);
else
blk_queue_rq_timeout(sdp->request_queue,
SD_MOD_TIMEOUT);
@@ -4131,7 +4146,7 @@ static int sd_start_stop_device(struct scsi_disk *sdkp, int start)
if (!scsi_device_online(sdp))
return -ENODEV;

- res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT,
+ res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, sd_timeout_sec * HZ,
sdkp->max_retries, &exec_args);
if (res) {
sd_print_result(sdkp, "Start/Stop Unit failed", res);