[PATCH v7 4/5] vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT

From: Abhishek Sahu
Date: Mon Aug 29 2022 - 08:07:22 EST


Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.

1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT

The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.

2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.

There are various cases where the device will not go into the runtime
suspended state. For example,

- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.

For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.

3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.

4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.

5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.

6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.

Signed-off-by: Abhishek Sahu <abhsahu@xxxxxxxxxx>
---
drivers/vfio/pci/vfio_pci_core.c | 153 +++++++++++++++++++++++++++++--
include/linux/vfio_pci_core.h | 1 +
2 files changed, 146 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 207ede189c2a..9c612162653f 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -277,11 +277,100 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
return ret;
}

+static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev)
+{
+ /*
+ * The vdev power related flags are protected with 'memory_lock'
+ * semaphore.
+ */
+ vfio_pci_zap_and_down_write_memory_lock(vdev);
+ if (vdev->pm_runtime_engaged) {
+ up_write(&vdev->memory_lock);
+ return -EINVAL;
+ }
+
+ vdev->pm_runtime_engaged = true;
+ pm_runtime_put_noidle(&vdev->pdev->dev);
+ up_write(&vdev->memory_lock);
+
+ return 0;
+}
+
+static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz)
+{
+ struct vfio_pci_core_device *vdev =
+ container_of(device, struct vfio_pci_core_device, vdev);
+ int ret;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
+ if (ret != 1)
+ return ret;
+
+ /*
+ * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count
+ * will be decremented. The pm_runtime_put() will be invoked again
+ * while returning from the ioctl and then the device can go into
+ * runtime suspended state.
+ */
+ return vfio_pci_runtime_pm_entry(vdev);
+}
+
+static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
+{
+ if (vdev->pm_runtime_engaged) {
+ vdev->pm_runtime_engaged = false;
+ pm_runtime_get_noresume(&vdev->pdev->dev);
+ }
+}
+
+static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
+{
+ /*
+ * The vdev power related flags are protected with 'memory_lock'
+ * semaphore.
+ */
+ down_write(&vdev->memory_lock);
+ __vfio_pci_runtime_pm_exit(vdev);
+ up_write(&vdev->memory_lock);
+}
+
+static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz)
+{
+ struct vfio_pci_core_device *vdev =
+ container_of(device, struct vfio_pci_core_device, vdev);
+ int ret;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
+ if (ret != 1)
+ return ret;
+
+ /*
+ * The device is always in the active state here due to pm wrappers
+ * around ioctls.
+ */
+ vfio_pci_runtime_pm_exit(vdev);
+ return 0;
+}
+
#ifdef CONFIG_PM
static int vfio_pci_core_runtime_suspend(struct device *dev)
{
struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);

+ down_write(&vdev->memory_lock);
+ /*
+ * The user can move the device into D3hot state before invoking
+ * power management IOCTL. Move the device into D0 state here and then
+ * the pci-driver core runtime PM suspend function will move the device
+ * into the low power state. Also, for the devices which have
+ * NoSoftRst-, it will help in restoring the original state
+ * (saved locally in 'vdev->pm_save').
+ */
+ vfio_pci_set_power_state(vdev, PCI_D0);
+ up_write(&vdev->memory_lock);
+
/*
* If INTx is enabled, then mask INTx before going into the runtime
* suspended state and unmask the same in the runtime resume.
@@ -418,6 +507,18 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)

/*
* This function can be invoked while the power state is non-D0.
+ * This non-D0 power state can be with or without runtime PM.
+ * vfio_pci_runtime_pm_exit() will internally increment the usage
+ * count corresponding to pm_runtime_put() called during low power
+ * feature entry and then pm_runtime_resume() will wake up the device,
+ * if the device has already gone into the suspended state. Otherwise,
+ * the vfio_pci_set_power_state() will change the device power state
+ * to D0.
+ */
+ vfio_pci_runtime_pm_exit(vdev);
+ pm_runtime_resume(&pdev->dev);
+
+ /*
* This function calls __pci_reset_function_locked() which internally
* can use pci_pm_reset() for the function reset. pci_pm_reset() will
* fail if the power state is non-D0. Also, for the devices which
@@ -1273,6 +1374,10 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
void __user *arg, size_t argsz)
{
switch (flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
+ return vfio_pci_core_pm_entry(device, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
+ return vfio_pci_core_pm_exit(device, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
return vfio_pci_core_feature_token(device, flags, arg, argsz);
default:
@@ -1285,31 +1390,47 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ int ret;

if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
return -EINVAL;

+ ret = pm_runtime_resume_and_get(&vdev->pdev->dev);
+ if (ret) {
+ pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n",
+ ret);
+ return -EIO;
+ }
+
switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
- return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
+ ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
+ break;

case VFIO_PCI_ROM_REGION_INDEX:
if (iswrite)
- return -EINVAL;
- return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
+ ret = -EINVAL;
+ else
+ ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false);
+ break;

case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
- return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
+ ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
+ break;

case VFIO_PCI_VGA_REGION_INDEX:
- return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
+ ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
+ break;
+
default:
index -= VFIO_PCI_NUM_REGIONS;
- return vdev->region[index].ops->rw(vdev, buf,
+ ret = vdev->region[index].ops->rw(vdev, buf,
count, ppos, iswrite);
+ break;
}

- return -EINVAL;
+ pm_runtime_put(&vdev->pdev->dev);
+ return ret;
}

ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
@@ -1504,7 +1625,11 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
mutex_lock(&vdev->vma_lock);
down_read(&vdev->memory_lock);

- if (!__vfio_pci_memory_enabled(vdev)) {
+ /*
+ * Memory region cannot be accessed if the low power feature is engaged
+ * or memory access is disabled.
+ */
+ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
ret = VM_FAULT_SIGBUS;
goto up_out;
}
@@ -2219,6 +2344,15 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
goto err_unlock;
}

+ /*
+ * Some of the devices in the dev_set can be in the runtime suspended
+ * state. Increment the usage count for all the devices in the dev_set
+ * before reset and decrement the same after reset.
+ */
+ ret = vfio_pci_dev_set_pm_runtime_get(dev_set);
+ if (ret)
+ goto err_unlock;
+
list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
/*
* Test whether all the affected devices are contained by the
@@ -2274,6 +2408,9 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
else
mutex_unlock(&cur->vma_lock);
}
+
+ list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+ pm_runtime_put(&cur->pdev->dev);
err_unlock:
mutex_unlock(&dev_set->lock);
return ret;
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index a0f1f36e42a2..1025d53fde0b 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -79,6 +79,7 @@ struct vfio_pci_core_device {
bool nointx;
bool needs_pm_restore;
bool pm_intx_masked;
+ bool pm_runtime_engaged;
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
--
2.17.1