Re: [PATCH] hwmon: corsair-psu: add support for critical values

From: Guenter Roeck
Date: Sun Mar 14 2021 - 14:23:10 EST


On 3/14/21 5:42 AM, Wilken Gottwalt wrote:
> Adds support for reading the critical values of the temperature sensors
> and the rail sensors (voltage and current) once and caches them. Updates
> the naming of the constants following a more clear scheme. Also updates
> the documentation and fixes a typo.
>
> The new sensors output of a Corsair HX850i will look like this:
> corsairpsu-hid-3-1
> Adapter: HID adapter
> v_in: 230.00 V
> v_out +12v: 12.14 V (crit min = +8.41 V, crit max = +15.59 V)
> v_out +5v: 5.03 V (crit min = +3.50 V, crit max = +6.50 V)
> v_out +3.3v: 3.30 V (crit min = +2.31 V, crit max = +4.30 V)
> psu fan: 0 RPM
> vrm temp: +46.2°C (crit = +70.0°C)
> case temp: +39.8°C (crit = +70.0°C)
> power total: 152.00 W
> power +12v: 108.00 W
> power +5v: 41.00 W
> power +3.3v: 5.00 W
> curr in: N/A
> curr +12v: 9.00 A (crit max = +85.00 A)
> curr +5v: 8.31 A (crit max = +40.00 A)
> curr +3.3v: 1.62 A (crit max = +40.00 A)
>
> This patch changes:
> - hwmon corsair-psu documentation
> - hwmon corsair-psu driver
>
> Signed-off-by: Wilken Gottwalt <wilken.gottwalt@xxxxxxxxxx>
> ---
> Documentation/hwmon/corsair-psu.rst | 13 +-
> drivers/hwmon/corsair-psu.c | 185 ++++++++++++++++++++++------
> 2 files changed, 157 insertions(+), 41 deletions(-)
>
> diff --git a/Documentation/hwmon/corsair-psu.rst b/Documentation/hwmon/corsair-psu.rst
> index 396b95c9a76a..b77e53810a13 100644
> --- a/Documentation/hwmon/corsair-psu.rst
> +++ b/Documentation/hwmon/corsair-psu.rst
> @@ -45,19 +45,30 @@ Sysfs entries
> -------------
>
> ======================= ========================================================
> +curr2_crit Current max critical value on the 12v psu rail
> +curr3_crit Current max critical value on the 5v psu rail
> +curr4_crit Current max critical value on the 3.3v psu rail
> curr1_input Total current usage
> curr2_input Current on the 12v psu rail
> curr3_input Current on the 5v psu rail
> curr4_input Current on the 3.3v psu rail
> fan1_input RPM of psu fan
> +in1_crit Voltage max critical value on the 12v psu rail
> +in2_crit Voltage max critical value on the 5v psu rail
> +in3_crit Voltage max critical value on the 3.3v psu rail
> in0_input Voltage of the psu ac input
> in1_input Voltage of the 12v psu rail
> in2_input Voltage of the 5v psu rail
> -in3_input Voltage of the 3.3 psu rail
> +in3_input Voltage of the 3.3v psu rail
> +in1_lcrit Voltage min critical value on the 12v psu rail
> +in2_lcrit Voltage min critical value on the 5v psu rail
> +in3_lcrit Voltage min critical value on the 3.3v psu rail
> power1_input Total power usage
> power2_input Power usage of the 12v psu rail
> power3_input Power usage of the 5v psu rail
> power4_input Power usage of the 3.3v psu rail
> +temp1_crit Temperature max cirtical value of the psu vrm component
> +temp2_crit Temperature max critical value of psu case
> temp1_input Temperature of the psu vrm component
> temp2_input Temperature of the psu case
> ======================= ========================================================
> diff --git a/drivers/hwmon/corsair-psu.c b/drivers/hwmon/corsair-psu.c
> index b0953eeeb2d3..141a5079ea7e 100644
> --- a/drivers/hwmon/corsair-psu.c
> +++ b/drivers/hwmon/corsair-psu.c
> @@ -53,11 +53,21 @@
> #define CMD_TIMEOUT_MS 250
> #define SECONDS_PER_HOUR (60 * 60)
> #define SECONDS_PER_DAY (SECONDS_PER_HOUR * 24)
> +#define RAIL_COUNT 3 /* 3v3 + 5v + 12v */
> +#define CRIT_VALUES_COUNT 11 /* 2 temp crit + 6 rail volts (low and high) + 3 rails amps */
> +#define TEMP_HCRIT 0
> +#define VOLTS_RAIL_HCRIT 2
> +#define VOLTS_RAIL_LCRIT 5
> +#define AMPS_RAIL_HCRIT 8
>
> #define PSU_CMD_SELECT_RAIL 0x00 /* expects length 2 */
> -#define PSU_CMD_IN_VOLTS 0x88 /* the rest of the commands expect length 3 */
> +#define PSU_CMD_RAIL_VOLTS_HCRIT 0x40 /* the rest of the commands expect length 3 */
> +#define PSU_CMD_RAIL_VOLTS_LCRIT 0x44
> +#define PSU_CMD_RAIL_AMPS_HCRIT 0x46
> +#define PSU_CMD_TEMP_HCRIT 0x4F
> +#define PSU_CMD_IN_VOLTS 0x88
> #define PSU_CMD_IN_AMPS 0x89
> -#define PSU_CMD_RAIL_OUT_VOLTS 0x8B
> +#define PSU_CMD_RAIL_VOLTS 0x8B
> #define PSU_CMD_RAIL_AMPS 0x8C
> #define PSU_CMD_TEMP0 0x8D
> #define PSU_CMD_TEMP1 0x8E
> @@ -113,6 +123,7 @@ struct corsairpsu_data {
> struct dentry *debugfs;
> struct completion wait_completion;
> struct mutex lock; /* for locking access to cmd_buffer */
> + long crit_values[CRIT_VALUES_COUNT];
> u8 *cmd_buffer;
> char vendor[REPLY_SIZE];
> char product[REPLY_SIZE];
> @@ -193,7 +204,10 @@ static int corsairpsu_request(struct corsairpsu_data *priv, u8 cmd, u8 rail, voi
>
> mutex_lock(&priv->lock);
> switch (cmd) {
> - case PSU_CMD_RAIL_OUT_VOLTS:
> + case PSU_CMD_RAIL_VOLTS_HCRIT:
> + case PSU_CMD_RAIL_VOLTS_LCRIT:
> + case PSU_CMD_RAIL_AMPS_HCRIT:
> + case PSU_CMD_RAIL_VOLTS:
> case PSU_CMD_RAIL_AMPS:
> case PSU_CMD_RAIL_WATTS:
> ret = corsairpsu_usb_cmd(priv, 2, PSU_CMD_SELECT_RAIL, rail, NULL);
> @@ -229,9 +243,13 @@ static int corsairpsu_get_value(struct corsairpsu_data *priv, u8 cmd, u8 rail, l
> */
> tmp = ((long)data[3] << 24) + (data[2] << 16) + (data[1] << 8) + data[0];
> switch (cmd) {
> + case PSU_CMD_RAIL_VOLTS_HCRIT:
> + case PSU_CMD_RAIL_VOLTS_LCRIT:
> + case PSU_CMD_RAIL_AMPS_HCRIT:
> + case PSU_CMD_TEMP_HCRIT:
> case PSU_CMD_IN_VOLTS:
> case PSU_CMD_IN_AMPS:
> - case PSU_CMD_RAIL_OUT_VOLTS:
> + case PSU_CMD_RAIL_VOLTS:
> case PSU_CMD_RAIL_AMPS:
> case PSU_CMD_TEMP0:
> case PSU_CMD_TEMP1:
> @@ -256,18 +274,70 @@ static int corsairpsu_get_value(struct corsairpsu_data *priv, u8 cmd, u8 rail, l
> return ret;
> }
>
> +static void corsairpsu_get_criticals(struct corsairpsu_data *priv)
> +{
> + long tmp;
> + int rail;
> + int ret;
> +
> + ret = corsairpsu_get_value(priv, PSU_CMD_TEMP_HCRIT, 0, &tmp);
> + if (ret < 0)
> + pr_debug("%s: unable to read temp0 critical value\n", DRIVER_NAME);
> + else
> + priv->crit_values[TEMP_HCRIT] = tmp;
> +
> + ret = corsairpsu_get_value(priv, PSU_CMD_TEMP_HCRIT, 1, &tmp);
> + if (ret < 0)
> + pr_debug("%s: unable to read temp1 cirtical value\n", DRIVER_NAME);
> + else
> + priv->crit_values[TEMP_HCRIT + 1] = tmp;
> +
> + for (rail = 0; rail < RAIL_COUNT; ++rail) {
> + ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_VOLTS_HCRIT, rail, &tmp);
> + if (ret < 0) {
> + pr_debug("%s: unable to read volts rail %d high critical value\n",
> + DRIVER_NAME, rail);
> + } else {
> + priv->crit_values[VOLTS_RAIL_HCRIT + rail] = tmp;
> + }
> + }
> +
> + for (rail = 0; rail < RAIL_COUNT; ++rail) {
> + ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_VOLTS_LCRIT, rail, &tmp);
> + if (ret < 0) {
> + pr_debug("%s: unable to read volts rail %d low critical value\n",
> + DRIVER_NAME, rail);
> + } else {
> + priv->crit_values[VOLTS_RAIL_LCRIT + rail] = tmp;
> + }

I am not happy with this code. First, it is quite complex. Second,
it uses crit_values[] to indicate both valid limits and error codes.
That is mostly fine until there is ever a negative limit (which can happen
if there is ever a low temperature limit).

It would be much better to have a limit_supported bit map as well as limit arrays.
On top of that, I am not sure if bundling all limits into a single array is worth it.
Something like

ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_VOLTS_LCRIT, rail, &tmp);
if (ret == 0) {
priv->lcrit_in_supported |= BIT(rail);
priv->lcrit_in[rail] = tmp;
}

would be much easier to understand, and it would and be much less error prone.

The is_visible function could then simply check for
if (priv->lcrit_in_supported & BIT(channel))
to determine if limits are supported for a channel.

I also don't see a value in the debug messages. That is either is a bug, or it is
normal operation for some PSUs. If it is a bug, it should be reported as such
and result in an error. If it is normal operation, the result can be seen
in the non-existing attribute, and there is no need for an extra message.

> + }
> +
> + for (rail = 0; rail < RAIL_COUNT; ++rail) {
> + ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_AMPS_HCRIT, rail, &tmp);
> + if (ret < 0) {
> + pr_debug("%s: unable to read amps rail %d hight critical value\n",
> + DRIVER_NAME, rail);
> + } else {
> + priv->crit_values[AMPS_RAIL_HCRIT + rail] = tmp;
> + }
> + }
> +}
> +
> static umode_t corsairpsu_hwmon_ops_is_visible(const void *data, enum hwmon_sensor_types type,
> u32 attr, int channel)
> {
> - if (type == hwmon_temp && (attr == hwmon_temp_input || attr == hwmon_temp_label))
> + if (type == hwmon_temp && (attr == hwmon_temp_input || attr == hwmon_temp_label ||
> + attr == hwmon_temp_crit))
> return 0444;
> else if (type == hwmon_fan && (attr == hwmon_fan_input || attr == hwmon_fan_label))
> return 0444;
> else if (type == hwmon_power && (attr == hwmon_power_input || attr == hwmon_power_label))
> return 0444;
> - else if (type == hwmon_in && (attr == hwmon_in_input || attr == hwmon_in_label))
> + else if (type == hwmon_in && (attr == hwmon_in_input || attr == hwmon_in_label ||
> + attr == hwmon_in_lcrit || attr == hwmon_in_crit))
> return 0444;
> - else if (type == hwmon_curr && (attr == hwmon_curr_input || attr == hwmon_curr_label))
> + else if (type == hwmon_curr && (attr == hwmon_curr_input || attr == hwmon_curr_label ||
> + attr == hwmon_curr_crit))
> return 0444;
>
> return 0;
> @@ -277,11 +347,18 @@ static int corsairpsu_hwmon_ops_read(struct device *dev, enum hwmon_sensor_types
> int channel, long *val)
> {
> struct corsairpsu_data *priv = dev_get_drvdata(dev);
> - int ret;
> -
> - if (type == hwmon_temp && attr == hwmon_temp_input && channel < 2) {
> - ret = corsairpsu_get_value(priv, channel ? PSU_CMD_TEMP1 : PSU_CMD_TEMP0, channel,
> - val);
> + int ret = 0;
> +
> + if (type == hwmon_temp && channel < 2) {
> + if (attr == hwmon_temp_input) {
> + ret = corsairpsu_get_value(priv, channel ? PSU_CMD_TEMP1 : PSU_CMD_TEMP0,
> + channel, val);
> + } else if (attr == hwmon_temp_crit) {
> + if (priv->crit_values[TEMP_HCRIT + channel] != -EOPNOTSUPP)
> + *val = priv->crit_values[TEMP_HCRIT + channel];
> + else
> + ret = -EOPNOTSUPP;
> + }
> } else if (type == hwmon_fan && attr == hwmon_fan_input) {
> ret = corsairpsu_get_value(priv, PSU_CMD_FAN, 0, val);
> } else if (type == hwmon_power && attr == hwmon_power_input) {
> @@ -295,27 +372,48 @@ static int corsairpsu_hwmon_ops_read(struct device *dev, enum hwmon_sensor_types
> default:
> return -EOPNOTSUPP;
> }
> - } else if (type == hwmon_in && attr == hwmon_in_input) {
> - switch (channel) {
> - case 0:
> - ret = corsairpsu_get_value(priv, PSU_CMD_IN_VOLTS, 0, val);
> - break;
> - case 1 ... 3:
> - ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_OUT_VOLTS, channel - 1, val);
> - break;
> - default:
> - return -EOPNOTSUPP;
> + } else if (type == hwmon_in) {
> + if (attr == hwmon_in_input) {
> + switch (channel) {
> + case 0:
> + ret = corsairpsu_get_value(priv, PSU_CMD_IN_VOLTS, 0, val);
> + break;
> + case 1 ... 3:
> + ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_VOLTS, channel - 1,
> + val);
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> + } else if (attr == hwmon_in_crit && channel > 0 && channel < 4) {
> + if (priv->crit_values[VOLTS_RAIL_HCRIT + channel - 1] != -EOPNOTSUPP)
> + *val = priv->crit_values[VOLTS_RAIL_HCRIT + channel - 1];
> + else
> + ret = -EOPNOTSUPP;
> + } else if (attr == hwmon_in_lcrit && channel > 0 && channel < 4) {
> + if (priv->crit_values[VOLTS_RAIL_LCRIT + channel - 1] != -EOPNOTSUPP)
> + *val = priv->crit_values[VOLTS_RAIL_LCRIT + channel - 1];
> + else
> + ret = -EOPNOTSUPP;
> }
> - } else if (type == hwmon_curr && attr == hwmon_curr_input) {
> - switch (channel) {
> - case 0:
> - ret = corsairpsu_get_value(priv, PSU_CMD_IN_AMPS, 0, val);
> - break;
> - case 1 ... 3:
> - ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_AMPS, channel - 1, val);
> - break;
> - default:
> - return -EOPNOTSUPP;
> + } else if (type == hwmon_curr) {
> + if (attr == hwmon_curr_input) {
> + switch (channel) {
> + case 0:
> + ret = corsairpsu_get_value(priv, PSU_CMD_IN_AMPS, 0, val);
> + break;
> + case 1 ... 3:
> + ret = corsairpsu_get_value(priv, PSU_CMD_RAIL_AMPS, channel - 1,
> + val);
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> + } else if (attr == hwmon_curr_crit && channel > 0 && channel < 4) {
> + if (priv->crit_values[AMPS_RAIL_HCRIT + channel - 1] != -EOPNOTSUPP)
> + *val = priv->crit_values[AMPS_RAIL_HCRIT + channel - 1];
> + else
> + ret = -EOPNOTSUPP;
> }

Due to the channel range check, this returns a random value if the function is ever called
with attr == hwmon_curr_crit && channel == 0. This shows that the channel check is
really unnecessary (but also that the code is getting dificult to understand).

> } else {
> return -EOPNOTSUPP;

I think it is time to split this code into per-type functions. It is getting unreadable.

> @@ -360,8 +458,8 @@ static const struct hwmon_channel_info *corsairpsu_info[] = {
> HWMON_CHANNEL_INFO(chip,
> HWMON_C_REGISTER_TZ),
> HWMON_CHANNEL_INFO(temp,
> - HWMON_T_INPUT | HWMON_T_LABEL,
> - HWMON_T_INPUT | HWMON_T_LABEL),
> + HWMON_T_INPUT | HWMON_T_LABEL | HWMON_T_CRIT,
> + HWMON_T_INPUT | HWMON_T_LABEL | HWMON_T_CRIT),
> HWMON_CHANNEL_INFO(fan,
> HWMON_F_INPUT | HWMON_F_LABEL),
> HWMON_CHANNEL_INFO(power,
> @@ -371,14 +469,14 @@ static const struct hwmon_channel_info *corsairpsu_info[] = {
> HWMON_P_INPUT | HWMON_P_LABEL),
> HWMON_CHANNEL_INFO(in,
> HWMON_I_INPUT | HWMON_I_LABEL,
> - HWMON_I_INPUT | HWMON_I_LABEL,
> - HWMON_I_INPUT | HWMON_I_LABEL,
> - HWMON_I_INPUT | HWMON_I_LABEL),
> + HWMON_I_INPUT | HWMON_I_LABEL | HWMON_I_LCRIT | HWMON_I_CRIT,
> + HWMON_I_INPUT | HWMON_I_LABEL | HWMON_I_LCRIT | HWMON_I_CRIT,
> + HWMON_I_INPUT | HWMON_I_LABEL | HWMON_I_LCRIT | HWMON_I_CRIT),
> HWMON_CHANNEL_INFO(curr,
> HWMON_C_INPUT | HWMON_C_LABEL,
> - HWMON_C_INPUT | HWMON_C_LABEL,
> - HWMON_C_INPUT | HWMON_C_LABEL,
> - HWMON_C_INPUT | HWMON_C_LABEL),
> + HWMON_C_INPUT | HWMON_C_LABEL | HWMON_C_CRIT,
> + HWMON_C_INPUT | HWMON_C_LABEL | HWMON_C_CRIT,
> + HWMON_C_INPUT | HWMON_C_LABEL | HWMON_C_CRIT),
> NULL
> };
>
> @@ -472,6 +570,7 @@ static void corsairpsu_debugfs_init(struct corsairpsu_data *priv)
> static int corsairpsu_probe(struct hid_device *hdev, const struct hid_device_id *id)
> {
> struct corsairpsu_data *priv;
> + int i;
> int ret;
>
> priv = devm_kzalloc(&hdev->dev, sizeof(struct corsairpsu_data), GFP_KERNEL);
> @@ -482,6 +581,9 @@ static int corsairpsu_probe(struct hid_device *hdev, const struct hid_device_id
> if (!priv->cmd_buffer)
> return -ENOMEM;
>
> + for (i = 0; i < CRIT_VALUES_COUNT; ++i)
> + priv->crit_values[i] = -EOPNOTSUPP;
> +
> ret = hid_parse(hdev);
> if (ret)
> return ret;
> @@ -513,6 +615,9 @@ static int corsairpsu_probe(struct hid_device *hdev, const struct hid_device_id
> goto fail_and_stop;
> }
>
> + /* this can fail and is considered non-fatal */
> + corsairpsu_get_criticals(priv);
> +

The comment does not add value, but it does add confusion. "Fail" implies
that something is wrong. However, I suspect that it only means that not all
power supplies report limits. That is not a failure, it is normal operation.

Thanks,
Guenter

> priv->hwmon_dev = hwmon_device_register_with_info(&hdev->dev, "corsairpsu", priv,
> &corsairpsu_chip_info, 0);
>
>