Re: [PATCH 11/17] fpga: dfl: afu: add error reporting support.

From: Alan Tull
Date: Tue Apr 09 2019 - 16:58:19 EST


On Sun, Mar 24, 2019 at 10:24 PM Wu Hao <hao.wu@xxxxxxxxx> wrote:

Hi Hao,

>
> Error reporting is one important private feature, it reports error
> detected on port and accelerated function unit (AFU). It introduces
> several sysfs interfaces to allow userspace to check and clear
> errors detected by hardware.
>
> Signed-off-by: Xu Yilun <yilun.xu@xxxxxxxxx>
> Signed-off-by: Wu Hao <hao.wu@xxxxxxxxx>
> ---
> Documentation/ABI/testing/sysfs-platform-dfl-port | 29 +++
> drivers/fpga/Makefile | 1 +
> drivers/fpga/dfl-afu-error.c | 225 ++++++++++++++++++++++
> drivers/fpga/dfl-afu-main.c | 4 +
> drivers/fpga/dfl-afu.h | 4 +
> 5 files changed, 263 insertions(+)
> create mode 100644 drivers/fpga/dfl-afu-error.c
>
> diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
> index f611e47..e6140aa 100644
> --- a/Documentation/ABI/testing/sysfs-platform-dfl-port
> +++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
> @@ -79,3 +79,32 @@ KernelVersion: 5.2
> Contact: Wu Hao <hao.wu@xxxxxxxxx>
> Description: Read-only. Read this file to get the status of issued command
> to userclck_freqcntrcmd.
> +
> +What: /sys/bus/platform/devices/dfl-port.0/errors/errors
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. Read this file to get errors detected on port and
> + Accelerated Function Unit (AFU).
> +
> +What: /sys/bus/platform/devices/dfl-port.0/errors/first_error
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. Read this file to get the first error detected by
> + hardware.
> +
> +What: /sys/bus/platform/devices/dfl-port.0/errors/first_malformed_req
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. Read this file to get the first malformed request
> + captured by hardware.
> +
> +What: /sys/bus/platform/devices/dfl-port.0/errors/clear
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Write-only. Write error code to this file to clear errors. If
> + the input error code doesn't match, it returns -EBUSY error
> + code.

I understand how -EBUSY could be the right error code for when the
hardware is in a state where the error can't be cleared. But if the
input error code doesn't match, shouldn't the code be -EINVAL? Also
as noted below, the way this is currently coded, -ETIMEDOUT could get
returned.

> diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> index c0dd4c8..f1f0af7 100644
> --- a/drivers/fpga/Makefile
> +++ b/drivers/fpga/Makefile
> @@ -40,6 +40,7 @@ obj-$(CONFIG_FPGA_DFL_AFU) += dfl-afu.o
>
> dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
> dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
> +dfl-afu-objs += dfl-afu-error.o
>
> # Drivers for FPGAs which implement DFL
> obj-$(CONFIG_FPGA_DFL_PCI) += dfl-pci.o
> diff --git a/drivers/fpga/dfl-afu-error.c b/drivers/fpga/dfl-afu-error.c
> new file mode 100644
> index 0000000..b66bd4a
> --- /dev/null
> +++ b/drivers/fpga/dfl-afu-error.c
> @@ -0,0 +1,225 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Driver for FPGA Accelerated Function Unit (AFU) Error Reporting
> + *
> + * Copyright 2019 Intel Corporation, Inc.
> + *
> + * Authors:
> + * Wu Hao <hao.wu@xxxxxxxxxxxxxxx>
> + * Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx>
> + * Joseph Grecco <joe.grecco@xxxxxxxxx>
> + * Enno Luebbers <enno.luebbers@xxxxxxxxx>
> + * Tim Whisonant <tim.whisonant@xxxxxxxxx>
> + * Ananda Ravuri <ananda.ravuri@xxxxxxxxx>
> + * Mitchel Henry <henry.mitchel@xxxxxxxxx>
> + */
> +
> +#include <linux/uaccess.h>
> +
> +#include "dfl-afu.h"
> +
> +#define PORT_ERROR_MASK 0x8
> +#define PORT_ERROR 0x10
> +#define PORT_FIRST_ERROR 0x18
> +#define PORT_MALFORMED_REQ0 0x20
> +#define PORT_MALFORMED_REQ1 0x28
> +
> +#define ERROR_MASK GENMASK_ULL(63, 0)
> +
> +/* mask or unmask port errors by the error mask register. */
> +static void __port_err_mask(struct device *dev, bool mask)
> +{
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> + writeq(mask ? ERROR_MASK : 0, base + PORT_ERROR_MASK);
> +}
> +
> +/* clear port errors. */
> +static int __port_err_clear(struct device *dev, u64 err)
> +{
> + struct platform_device *pdev = to_platform_device(dev);
> + void __iomem *base_err, *base_hdr;
> + int ret;
> + u64 v;
> +
> + base_err = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> + base_hdr = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_HEADER);
> +
> + /*
> + * clear Port Errors
> + *
> + * - Check for AP6 State
> + * - Halt Port by keeping Port in reset
> + * - Set PORT Error mask to all 1 to mask errors
> + * - Clear all errors
> + * - Set Port mask to all 0 to enable errors
> + * - All errors start capturing new errors
> + * - Enable Port by pulling the port out of reset
> + */
> +
> + /* if device is still in AP6 power state, can not clear any error. */
> + v = readq(base_hdr + PORT_HDR_STS);
> + if (FIELD_GET(PORT_STS_PWR_STATE, v) == PORT_STS_PWR_STATE_AP6) {
> + dev_err(dev, "Could not clear errors, device in AP6 state.\n");
> + return -EBUSY;
> + }
> +
> + /* Halt Port by keeping Port in reset */
> + ret = __port_disable(pdev);
> + if (ret)
> + return ret;

__port_disable can return -ETIMEDOUT which will then get returned from
clear_store. The sysfs document only talks about -EBUSY. You could
either document -ETIMEDOUT in the sysfs doc or you could change the
code to adjust the returned error code.

> +
> + /* Mask all errors */
> + __port_err_mask(dev, true);
> +
> + /* Clear errors if err input matches with current port errors.*/
> + v = readq(base_err + PORT_ERROR);
> +
> + if (v == err) {
> + writeq(v, base_err + PORT_ERROR);
> +
> + v = readq(base_err + PORT_FIRST_ERROR);
> + writeq(v, base_err + PORT_FIRST_ERROR);
> + } else {
> + ret = -EBUSY;
> + }
> +
> + /* Clear mask */
> + __port_err_mask(dev, false);
> +
> + /* Enable the Port by clear the reset */
> + __port_enable(pdev);
> +
> + return ret;
> +}
> +
> +static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> + return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base));
> +}
> +static DEVICE_ATTR_RO(revision);

This appears to be adding a
/sys/bus/platform/devices/dfl-port.0/errors/revision attribute that
isn't documented in the sysfs document.

> +
> +static ssize_t errors_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> + void __iomem *base;
> + u64 error;
> +
> + base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> + mutex_lock(&pdata->lock);
> + error = readq(base + PORT_ERROR);
> + mutex_unlock(&pdata->lock);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", (unsigned long long)error);
> +}
> +static DEVICE_ATTR_RO(errors);
> +
> +static ssize_t first_error_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> + void __iomem *base;
> + u64 error;
> +
> + base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> + mutex_lock(&pdata->lock);
> + error = readq(base + PORT_FIRST_ERROR);
> + mutex_unlock(&pdata->lock);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", (unsigned long long)error);
> +}
> +static DEVICE_ATTR_RO(first_error);
> +
> +static ssize_t first_malformed_req_show(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> + void __iomem *base;
> + u64 req0, req1;
> +
> + base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_ERROR);
> +
> + mutex_lock(&pdata->lock);
> + req0 = readq(base + PORT_MALFORMED_REQ0);
> + req1 = readq(base + PORT_MALFORMED_REQ1);
> + mutex_unlock(&pdata->lock);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%016llx%016llx\n",
> + (unsigned long long)req1, (unsigned long long)req0);
> +}
> +static DEVICE_ATTR_RO(first_malformed_req);
> +
> +static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
> + const char *buff, size_t count)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
> + u64 value;
> + int ret;
> +
> + if (kstrtou64(buff, 0, &value))
> + return -EINVAL;
> +
> + mutex_lock(&pdata->lock);
> + ret = __port_err_clear(dev, value);
> + mutex_unlock(&pdata->lock);
> +
> + return ret ? ret : count;
> +}
> +static DEVICE_ATTR_WO(clear);
> +
> +static struct attribute *port_err_attrs[] = {
> + &dev_attr_revision.attr,
> + &dev_attr_errors.attr,
> + &dev_attr_first_error.attr,
> + &dev_attr_first_malformed_req.attr,
> + &dev_attr_clear.attr,
> + NULL,
> +};
> +
> +static struct attribute_group port_err_attr_group = {
> + .attrs = port_err_attrs,
> + .name = "errors",
> +};
> +
> +static int port_err_init(struct platform_device *pdev,
> + struct dfl_feature *feature)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
> +
> + dev_dbg(&pdev->dev, "PORT ERR Init.\n");
> +
> + mutex_lock(&pdata->lock);
> + __port_err_mask(&pdev->dev, false);
> + mutex_unlock(&pdata->lock);
> +
> + return sysfs_create_group(&pdev->dev.kobj, &port_err_attr_group);
> +}
> +
> +static void port_err_uinit(struct platform_device *pdev,
> + struct dfl_feature *feature)
> +{
> + dev_dbg(&pdev->dev, "PORT ERR UInit.\n");
> +
> + sysfs_remove_group(&pdev->dev.kobj, &port_err_attr_group);
> +}
> +
> +const struct dfl_feature_id port_err_id_table[] = {
> + {.id = PORT_FEATURE_ID_ERROR,},
> + {0,}
> +};
> +
> +const struct dfl_feature_ops port_err_ops = {
> + .init = port_err_init,
> + .uinit = port_err_uinit,
> +};
> diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
> index e727d9b..754729e 100644
> --- a/drivers/fpga/dfl-afu-main.c
> +++ b/drivers/fpga/dfl-afu-main.c
> @@ -528,6 +528,10 @@ static struct dfl_feature_driver port_feature_drvs[] = {
> .ops = &port_afu_ops,
> },
> {
> + .id_table = port_err_id_table,
> + .ops = &port_err_ops,
> + },
> + {
> .ops = NULL,
> }
> };
> diff --git a/drivers/fpga/dfl-afu.h b/drivers/fpga/dfl-afu.h
> index 35e60c5..c3182a2 100644
> --- a/drivers/fpga/dfl-afu.h
> +++ b/drivers/fpga/dfl-afu.h
> @@ -100,4 +100,8 @@ int afu_dma_unmap_region(struct dfl_feature_platform_data *pdata, u64 iova);
> struct dfl_afu_dma_region *
> afu_dma_region_find(struct dfl_feature_platform_data *pdata,
> u64 iova, u64 size);
> +
> +extern const struct dfl_feature_ops port_err_ops;
> +extern const struct dfl_feature_id port_err_id_table[];
> +
> #endif /* __DFL_AFU_H */
> --
> 2.7.4
>

Thanks,
Alan