Re: [3.6-rc7] switcheroo race with Intel HDA...

From: Takashi Iwai
Date: Wed Oct 10 2012 - 08:34:23 EST


At Tue, 9 Oct 2012 22:26:40 +0800,
Daniel J Blueman wrote:
>
> On 9 October 2012 21:04, Takashi Iwai <tiwai@xxxxxxx> wrote:
> > At Tue, 9 Oct 2012 19:23:56 +0800,
> > Daniel J Blueman wrote:
> >> On 9 October 2012 18:07, Takashi Iwai <tiwai@xxxxxxx> wrote:
> >> > At Tue, 09 Oct 2012 12:04:08 +0200,
> >> > Takashi Iwai wrote:
> >> >> At Tue, 9 Oct 2012 00:34:09 +0800,
> >> >> Daniel J Blueman wrote:
> >> >> > On 8 October 2012 20:58, Takashi Iwai <tiwai@xxxxxxx> wrote:
> >> >> > > At Tue, 25 Sep 2012 13:20:05 +0800,
> >> >> > > Daniel J Blueman wrote:
> >> >> > >> On my Macbook with a discrete Nvidia GPU, there is a race between
> >> >> > >> selecting the integrated GPU and putting the discrete GPU into D3 [1],
> >> >> > >> reliably causing a kernel oops [2].
> >> >> > >>
> >> >> > >> Introducing a delay of ~1s between the calls prevents this. When the
> >> >> > >> second 'OFF' write path executes, it looks like struct azx at
> >> >> > >> card->private_data hasn't yet been allocated yet [3], so there is
> >> >> > >> likely some locking missing.
> >> >> > >
> >> >> > > It's rather pci_get_drvdata() returning NULL (i.e. card is NULL, thus
> >> >> > > card->private_data causes Oops). Could you check the patch like below
> >> >> > > and see whether you get a kernel warning (but no Oops) or the problem
> >> >> > > gets fixed by shifting the assignment of pci drvdata?
> >> >> > [...]
> >> >> >
> >> >> > Good patching. Calling pci_set_drvdata later prevents the oops in HDA,
> >> >> > though we see unexpected 0x0 responses in the response ring buffer
> >> >> > [1], which we don't see when there's a >~1.5s delay between IGD and
> >> >> > OFF.
> >> >>
> >> >> If the previous patch fixed, it means that the switching occurred
> >> >> during the device was being probed. Maybe a better approach to
> >> >> register the VGA switcheroo after the proper initialization.
> >> >>
> >> >> The patch below is a revised one. Please give it a try.
> >> >
> >> > Also, it's not clear which card spews the spurious response.
> >> > Apply the patch below in addition.
> >> [...]
> >>
> >> hda-intel: 0000:01:00.1: spurious response 0x0:0x0, last cmd=0x1f0004
> >> $ lspci -s :1:0.1
> >> 01:00.1 Audio device: NVIDIA Corporation Device 0e1b (rev ff)
> >>
> >> It's the NVIDIA device which presumably hasn't completed it's
> >> transition to D3 at the time the OFF is executed.
> >
> > OK, then could you try the patch below on the top of previous two
> > patches?
>
> The first IGD switcheroo command fails to switch to the integrated GPU:
>
> # cat /sys/kernel/debug/vgaswitcheroo/switch
> 0:DIS:+:Pwr:0000:01:00.0
> 1:IGD: :Pwr:0000:00:02.0
> 2:DIS-Audio: :Pwr:0000:01:00.1
> # echo IGD >/sys/kernel/debug/vgaswitcheroo/switch
> vga_switcheroo: client 1 refused switch
>
> I also instrumented snd_hda_lock_devices, but none of the failure
> paths are being taken, which would leave inconsistent state, as the
> return value isn't checked.

Hm, right, the return value of snd_hda_lock_devices() isn't checked,
but I don't understand how this results like above.
Basically switching is protected by mutex in vga_switcheroo.c, so the
whole operation in the client side should be serialized.

In anyway, try the patch below cleanly, and see the spurious message
error coming up at which timing.


thanks,

Takashi

---
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 6833835..4518b05 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -501,6 +501,7 @@ struct azx {

/* VGA-switcheroo setup */
unsigned int use_vga_switcheroo:1;
+ unsigned int vga_switcheroo_registered:1;
unsigned int init_failed:1; /* delayed init failed */
unsigned int disabled:1; /* disabled by VGA-switcher */

@@ -1597,6 +1598,8 @@ static int DELAYED_INIT_MARK azx_codec_create(struct azx *chip, const char *mode
int c, codecs, err;
int max_slots;

+ printk(KERN_DEBUG "XXX %s: azx_codec_create entered\n",
+ pci_name(chip->pci));
memset(&bus_temp, 0, sizeof(bus_temp));
bus_temp.private_data = chip;
bus_temp.modelname = model;
@@ -1673,6 +1676,8 @@ static int DELAYED_INIT_MARK azx_codec_create(struct azx *chip, const char *mode
snd_printk(KERN_ERR SFX "no codecs initialized\n");
return -ENXIO;
}
+ printk(KERN_DEBUG "XXX %s: azx_codec_create done\n",
+ pci_name(chip->pci));
return 0;
}

@@ -2615,6 +2620,8 @@ static void azx_vs_set_state(struct pci_dev *pci,
return;

disabled = (state == VGA_SWITCHEROO_OFF);
+ printk(KERN_DEBUG "XXX %s: chip->disabled=%d, disabled=%d\n",
+ pci_name(chip->pci), chip->disabled, disabled);
if (chip->disabled == disabled)
return;

@@ -2638,14 +2645,26 @@ static void azx_vs_set_state(struct pci_dev *pci,
disabled ? "Disabling" : "Enabling",
pci_name(chip->pci));
if (disabled) {
+ printk(KERN_DEBUG "XXX %s: azx_suspend...\n",
+ pci_name(chip->pci));
azx_suspend(&pci->dev);
+ printk(KERN_DEBUG "XXX %s: azx_suspend done...\n",
+ pci_name(chip->pci));
chip->disabled = true;
- snd_hda_lock_devices(chip->bus);
+ if (snd_hda_lock_devices(chip->bus))
+ snd_printk(KERN_WARNING SFX
+ "Cannot lock devices!\n");
} else {
snd_hda_unlock_devices(chip->bus);
chip->disabled = false;
+ printk(KERN_DEBUG "XXX %s: azx_resume...\n",
+ pci_name(chip->pci));
azx_resume(&pci->dev);
+ printk(KERN_DEBUG "XXX %s: azx_resume done...\n",
+ pci_name(chip->pci));
}
+ printk(KERN_DEBUG "XXX %s: switching finished: disabled=%d\n",
+ pci_name(chip->pci), chip->disabled);
}
}

@@ -2683,14 +2702,20 @@ static const struct vga_switcheroo_client_ops azx_vs_ops = {

static int __devinit register_vga_switcheroo(struct azx *chip)
{
+ int err;
+
if (!chip->use_vga_switcheroo)
return 0;
/* FIXME: currently only handling DIS controller
* is there any machine with two switchable HDMI audio controllers?
*/
- return vga_switcheroo_register_audio_client(chip->pci, &azx_vs_ops,
+ err = vga_switcheroo_register_audio_client(chip->pci, &azx_vs_ops,
VGA_SWITCHEROO_DIS,
chip->bus != NULL);
+ if (err < 0)
+ return err;
+ chip->vga_switcheroo_registered = 1;
+ return 0;
}
#else
#define init_vga_switcheroo(chip) /* NOP */
@@ -2712,7 +2737,8 @@ static int azx_free(struct azx *chip)
if (use_vga_switcheroo(chip)) {
if (chip->disabled && chip->bus)
snd_hda_unlock_devices(chip->bus);
- vga_switcheroo_unregister_client(chip->pci);
+ if (chip->vga_switcheroo_registered)
+ vga_switcheroo_unregister_client(chip->pci);
}

if (chip->initialized) {
@@ -3062,14 +3088,6 @@ static int __devinit azx_create(struct snd_card *card, struct pci_dev *pci,
}

ok:
- err = register_vga_switcheroo(chip);
- if (err < 0) {
- snd_printk(KERN_ERR SFX
- "Error registering VGA-switcheroo client\n");
- azx_free(chip);
- return err;
- }
-
err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
if (err < 0) {
snd_printk(KERN_ERR SFX "Error creating device [card]!\n");
@@ -3340,6 +3358,13 @@ static int __devinit azx_probe(struct pci_dev *pci,
if (pci_dev_run_wake(pci))
pm_runtime_put_noidle(&pci->dev);

+ err = register_vga_switcheroo(chip);
+ if (err < 0) {
+ snd_printk(KERN_ERR SFX
+ "Error registering VGA-switcheroo client\n");
+ goto out_free;
+ }
+
dev++;
return 0;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/