Re: ata: BUG in ata_sff_hsm_move

From: Dmitry Vyukov
Date: Mon Feb 01 2016 - 05:46:50 EST


On Fri, Jan 29, 2016 at 9:23 PM, Tejun Heo <tj@xxxxxxxxxx> wrote:
> Hello,
>
> On Fri, Jan 29, 2016 at 02:40:33PM +0100, Dmitry Vyukov wrote:
>> It now deadlocks at this stack. It is probably not OK to call
>> ata_sff_hsm_move() while holding the spinlock.
>
> Yeah, the locking is pretty messed up in the polling path. Can you
> please try the following?


No crashes over the weekend. We can consider this fixed.
Thanks!



> diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
> index cdf6215..7dbba38 100644
> --- a/drivers/ata/libata-sff.c
> +++ b/drivers/ata/libata-sff.c
> @@ -997,12 +997,9 @@ static inline int ata_hsm_ok_in_wq(struct ata_port *ap,
> static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
> {
> struct ata_port *ap = qc->ap;
> - unsigned long flags;
>
> if (ap->ops->error_handler) {
> if (in_wq) {
> - spin_lock_irqsave(ap->lock, flags);
> -
> /* EH might have kicked in while host lock is
> * released.
> */
> @@ -1014,8 +1011,6 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
> } else
> ata_port_freeze(ap);
> }
> -
> - spin_unlock_irqrestore(ap->lock, flags);
> } else {
> if (likely(!(qc->err_mask & AC_ERR_HSM)))
> ata_qc_complete(qc);
> @@ -1024,10 +1019,8 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
> }
> } else {
> if (in_wq) {
> - spin_lock_irqsave(ap->lock, flags);
> ata_sff_irq_on(ap);
> ata_qc_complete(qc);
> - spin_unlock_irqrestore(ap->lock, flags);
> } else
> ata_qc_complete(qc);
> }
> @@ -1048,9 +1041,10 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
> {
> struct ata_link *link = qc->dev->link;
> struct ata_eh_info *ehi = &link->eh_info;
> - unsigned long flags = 0;
> int poll_next;
>
> + lockdep_assert_held(ap->lock);
> +
> WARN_ON_ONCE((qc->flags & ATA_QCFLAG_ACTIVE) == 0);
>
> /* Make sure ata_sff_qc_issue() does not throw things
> @@ -1112,14 +1106,6 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
> }
> }
>
> - /* Send the CDB (atapi) or the first data block (ata pio out).
> - * During the state transition, interrupt handler shouldn't
> - * be invoked before the data transfer is complete and
> - * hsm_task_state is changed. Hence, the following locking.
> - */
> - if (in_wq)
> - spin_lock_irqsave(ap->lock, flags);
> -
> if (qc->tf.protocol == ATA_PROT_PIO) {
> /* PIO data out protocol.
> * send first data block.
> @@ -1135,9 +1121,6 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
> /* send CDB */
> atapi_send_cdb(ap, qc);
>
> - if (in_wq)
> - spin_unlock_irqrestore(ap->lock, flags);
> -
> /* if polling, ata_sff_pio_task() handles the rest.
> * otherwise, interrupt handler takes over from here.
> */
> @@ -1361,12 +1344,14 @@ static void ata_sff_pio_task(struct work_struct *work)
> u8 status;
> int poll_next;
>
> + spin_lock_irq(ap->lock);
> +
> BUG_ON(ap->sff_pio_task_link == NULL);
> /* qc can be NULL if timeout occurred */
> qc = ata_qc_from_tag(ap, link->active_tag);
> if (!qc) {
> ap->sff_pio_task_link = NULL;
> - return;
> + goto out_unlock;
> }
>
> fsm_start:
> @@ -1381,11 +1366,14 @@ static void ata_sff_pio_task(struct work_struct *work)
> */
> status = ata_sff_busy_wait(ap, ATA_BUSY, 5);
> if (status & ATA_BUSY) {
> + spin_unlock_irq(ap->lock);
> ata_msleep(ap, 2);
> + spin_lock_irq(ap->lock);
> +
> status = ata_sff_busy_wait(ap, ATA_BUSY, 10);
> if (status & ATA_BUSY) {
> ata_sff_queue_pio_task(link, ATA_SHORT_PAUSE);
> - return;
> + goto out_unlock;
> }
> }
>
> @@ -1402,6 +1390,8 @@ static void ata_sff_pio_task(struct work_struct *work)
> */
> if (poll_next)
> goto fsm_start;
> +out_unlock:
> + spin_unlock_irq(ap->lock);
> }
>
> /**