[RFC] sdhci: timeouts

From: Charles Hannum
Date: Sat May 28 2011 - 19:09:55 EST


Between all the devices with special “quirks” to use
SDHCI_QUIRK_BROKEN_TIMEOUT_VAL and
SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK, and the 19200 Google hits for
“linux sdhci timeout”, it sure seems like there's a problem there
somewhere. Having been bitten by it on my own Dell laptop, I went
poking, and found:

1) The second term of the timeout calculation (based on tacc_clks) is
totally bogus. It's dividing a whole number of SDCLK cycles by the
host clock frequency, but is expecting to get microseconds. See
attached patch sdhci-timeout-clks.diff.

2) The SDHCI spec is very specific that in the presence of both a
Transfer Complete and a Data Timeout Error, that the Transfer Complete
takes precedence. This is documented under the definition of the
Transfer Complete bit (page 53 of the SDHCI 2.0 spec). See attached
patch sdhci-timeout-int.diff.

3) There's a lot of folklore about buggy clocks on various chips, but
no hard data. I always hate this kind of folklore. I found it
helpful to actually measure the timeout and see if it was what we
expected. Ultimately this proved that the controllers in my machines
were in fact delivering timeouts pretty much exactly as expected. See
attached patch sdhci-timeout-log.diff (depends on the previous two
diffs); it outputs message of the form:

May 28 18:53:01 lop-nor kernel: [24056.231401] sdhci: timeout,
requested 508400484ns actual 510006113ns, TMCLK configured 33000
estimated 32896
May 28 18:53:33 lop-nor kernel: [24088.002670] sdhci: timeout,
requested 508400484ns actual 510005815ns, TMCLK configured 33000
estimated 32896
May 28 18:55:29 lop-nor kernel: [24204.139937] sdhci: timeout,
requested 508400484ns actual 500006027ns, TMCLK configured 33000
estimated 33554
May 28 18:55:29 lop-nor kernel: [24204.887654] sdhci: timeout,
requested 508400484ns actual 500006025ns, TMCLK configured 33000
estimated 33554

4) Ultimately I found that some SDHC cards just seem to take a good
long time to respond. I ended up increasing the write timeout to 1s
and I've encountered 0 problems since then. Since this is only an
error condition, and therefore hardly of any performance concerns, I
suggest it may be a good idea to do this in general. See attached
patch sdhci-timeout-limit.diff
--- linux-2.6.38/drivers/mmc/host/sdhci.c 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38/drivers/mmc/host/sdhci.c 2011-05-28 18:49:45.538077065 -0400
@@ -607,7 +607,7 @@

/* timeout in us */
target_timeout = data->timeout_ns / 1000 +
- data->timeout_clks / host->clock;
+ (data->timeout_clks * 1000) / (host->clock / 1000);

if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK)
host->timeout_clk = host->clock / 1000;
--- linux-2.6.38/drivers/mmc/host/sdhci.c 2011-05-28 18:49:45.538077065 -0400
+++ linux-2.6.38/drivers/mmc/host/sdhci.c 2011-05-28 18:52:02.799702610 -0400
@@ -1520,7 +1520,8 @@
return;
}

- if (intmask & SDHCI_INT_DATA_TIMEOUT)
+ if ((intmask & SDHCI_INT_DATA_TIMEOUT) &&
+ !(intmask & SDHCI_INT_DATA_END))
host->data->error = -ETIMEDOUT;
else if (intmask & SDHCI_INT_DATA_END_BIT)
host->data->error = -EILSEQ;
--- linux-2.6.38/drivers/mmc/host/sdhci.c 2011-05-28 18:52:02.799702610 -0400
+++ linux-2.6.38/drivers/mmc/host/sdhci.c 2011-05-28 18:46:55.356409956 -0400
@@ -602,15 +602,19 @@
* longer to time out, but that's much better than having a too-short
* timeout value.
*/
+#ifndef CONFIG_SDHCI_STUDY_TIMEOUTS
if (host->quirks & SDHCI_QUIRK_BROKEN_TIMEOUT_VAL)
return 0xE;
+#endif

/* timeout in us */
target_timeout = data->timeout_ns / 1000 +
(data->timeout_clks * 1000) / (host->clock / 1000);

+#ifndef CONFIG_SDHCI_STUDY_TIMEOUTS
if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK)
host->timeout_clk = host->clock / 1000;
+#endif

/*
* Figure out needed cycles.
@@ -671,6 +675,14 @@
host->data_early = 0;

count = sdhci_calc_timeout(host, data);
+#ifdef CONFIG_SDHCI_STUDY_TIMEOUTS
+ {
+ struct timespec now = CURRENT_TIME;
+ host->start_time = timespec_to_ns(&now);
+ host->expected_timeout = (u64)(1 << (13+count)) * 1000000 / host->timeout_clk;
+ host->timeout_counter = count;
+ }
+#endif
sdhci_writeb(host, count, SDHCI_TIMEOUT_CONTROL);

if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA))
@@ -1522,7 +1534,17 @@

if ((intmask & SDHCI_INT_DATA_TIMEOUT) &&
!(intmask & SDHCI_INT_DATA_END))
+ {
+#ifdef CONFIG_SDHCI_STUDY_TIMEOUTS
+ struct timespec now = CURRENT_TIME;
+ u64 actual_timeout = timespec_to_ns(&now) - host->start_time;
+ printk(KERN_WARNING "sdhci: timeout, requested %lldns actual %lldns, TMCLK configured %d estimated %lld\n",
+ (long long)host->expected_timeout, (long long)actual_timeout,
+ host->timeout_clk,
+ (long long)((u64)(1 << (13+host->timeout_counter)) * 1000000 / actual_timeout));
+#endif
host->data->error = -ETIMEDOUT;
+ }
else if (intmask & SDHCI_INT_DATA_END_BIT)
host->data->error = -EILSEQ;
else if ((intmask & SDHCI_INT_DATA_CRC) &&
--- linux-2.6.38/include/linux/mmc/sdhci.h 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38/include/linux/mmc/sdhci.h 2011-05-28 18:02:12.632367830 -0400
@@ -17,6 +17,8 @@
#include <linux/io.h>
#include <linux/mmc/host.h>

+#define CONFIG_SDHCI_STUDY_TIMEOUTS
+
struct sdhci_host {
/* Data set by hardware interface driver */
const char *hw_name; /* Hardware bus name */
@@ -145,6 +147,11 @@
unsigned int ocr_avail_sd;
unsigned int ocr_avail_mmc;

+#ifdef CONFIG_SDHCI_STUDY_TIMEOUTS
+ u64 expected_timeout, start_time;
+ u8 timeout_counter;
+#endif
+
unsigned long private[0] ____cacheline_aligned;
};
#endif /* __SDHCI_H */
--- linux-2.6.38/drivers/mmc/core/core.c 2011-03-14 21:20:32.000000000 -0400
+++ linux-2.6.38/drivers/mmc/core/core.c 2011-05-28 17:30:15.287392648 -0400
@@ -308,7 +308,7 @@
* The limit is really 250 ms, but that is
* insufficient for some crappy cards.
*/
- limit_us = 300000;
+ limit_us = 1000000;
else
limit_us = 100000;