summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2025-09-09 19:21:32 -0700
committerJakub Kicinski <kuba@kernel.org>2025-09-09 19:21:32 -0700
commitcf71bdf6863f0aebd00aeccce313b8833ee9c7f4 (patch)
treed876b60619d6dd8051ebf3f2e8a544fcacaf907b
parent04d1ff1d75ba76d0c5df1513acebf43973788b17 (diff)
parentcdc492746e3f6d73a9e6a6a9962c9f1f7b7961b5 (diff)
Merge branch 'net-mlx5e-add-pcie-congestion-event-extras'
Tariq Toukan says: ==================== net/mlx5e: Add pcie congestion event extras This small series by Dragos covers gaps requested in the initial pcie congestion series [1]: - Make pcie congestion thresholds configurable via devlink. - Add a counter for stale pcie congestion events. [1] https://lore.kernel.org/1752130292-22249-1-git-send-email-tariqt@nvidia.com ==================== Link: https://patch.msgid.link/1757237976-531416-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst7
-rw-r--r--Documentation/networking/devlink/mlx5.rst52
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/devlink.c106
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/devlink.h4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c79
5 files changed, 238 insertions, 10 deletions
diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst
index 754c81436408..cc498895f92e 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst
@@ -1348,7 +1348,7 @@ Device Counters
is in a congested state.
If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested.
If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested.
- - Tnformative
+ - Informative
* - `pci_bw_inbound_low`
- The number of times the device crossed the low inbound PCIe bandwidth
@@ -1373,3 +1373,8 @@ Device Counters
If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested.
If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested.
- Informative
+
+ * - `pci_bw_stale_event`
+ - The number of times the device fired a PCIe congestion event but on query
+ there was no change in state.
+ - Informative
diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst
index 07b1424cbfbb..60cc9fedf1ef 100644
--- a/Documentation/networking/devlink/mlx5.rst
+++ b/Documentation/networking/devlink/mlx5.rst
@@ -146,6 +146,58 @@ parameters.
- u32
- driverinit
- Control the size (in packets) of the hairpin queues.
+ * - ``pcie_cong_inbound_high``
+ - u16
+ - driverinit
+ - High threshold configuration for PCIe congestion events. The firmware
+ will send an event once device side inbound PCIe traffic went
+ above the configured high threshold for a long enough period (at least
+ 200ms).
+
+ See pci_bw_inbound_high ethtool stat.
+
+ Units are 0.01 %. Accepted values are in range [0, 10000].
+ pcie_cong_inbound_low < pcie_cong_inbound_high.
+ Default value: 9000 (Corresponds to 90%).
+ * - ``pcie_cong_inbound_low``
+ - u16
+ - driverinit
+ - Low threshold configuration for PCIe congestion events. The firmware
+ will send an event once device side inbound PCIe traffic went
+ below the configured low threshold, only after having been previously in
+ a congested state.
+
+ See pci_bw_inbound_low ethtool stat.
+
+ Units are 0.01 %. Accepted values are in range [0, 10000].
+ pcie_cong_inbound_low < pcie_cong_inbound_high.
+ Default value: 7500.
+ * - ``pcie_cong_outbound_high``
+ - u16
+ - driverinit
+ - High threshold configuration for PCIe congestion events. The firmware
+ will send an event once device side outbound PCIe traffic went
+ above the configured high threshold for a long enough period (at least
+ 200ms).
+
+ See pci_bw_outbound_high ethtool stat.
+
+ Units are 0.01 %. Accepted values are in range [0, 10000].
+ pcie_cong_outbound_low < pcie_cong_outbound_high.
+ Default value: 9000 (Corresponds to 90%).
+ * - ``pcie_cong_outbound_low``
+ - u16
+ - driverinit
+ - Low threshold configuration for PCIe congestion events. The firmware
+ will send an event once device side outbound PCIe traffic went
+ below the configured low threshold, only after having been previously in
+ a congested state.
+
+ See pci_bw_outbound_low ethtool stat.
+
+ Units are 0.01 %. Accepted values are in range [0, 10000].
+ pcie_cong_outbound_low < pcie_cong_outbound_high.
+ Default value: 7500.
* - ``cqe_compress_type``
- string
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 326d438e75b5..a0b68321355a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -651,6 +651,105 @@ static void mlx5_devlink_eth_params_unregister(struct devlink *devlink)
ARRAY_SIZE(mlx5_devlink_eth_params));
}
+#define MLX5_PCIE_CONG_THRESH_MAX 10000
+#define MLX5_PCIE_CONG_THRESH_DEF_LOW 7500
+#define MLX5_PCIE_CONG_THRESH_DEF_HIGH 9000
+
+static int
+mlx5_devlink_pcie_cong_thresh_validate(struct devlink *devl, u32 id,
+ union devlink_param_value val,
+ struct netlink_ext_ack *extack)
+{
+ if (val.vu16 > MLX5_PCIE_CONG_THRESH_MAX) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Value %u > max supported (%u)",
+ val.vu16, MLX5_PCIE_CONG_THRESH_MAX);
+
+ return -EINVAL;
+ }
+
+ switch (id) {
+ case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW:
+ case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH:
+ case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW:
+ case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void mlx5_devlink_pcie_cong_init_values(struct devlink *devlink)
+{
+ union devlink_param_value value;
+ u32 id;
+
+ value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_LOW;
+ id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW;
+ devl_param_driverinit_value_set(devlink, id, value);
+
+ value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH;
+ id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH;
+ devl_param_driverinit_value_set(devlink, id, value);
+
+ value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_LOW;
+ id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW;
+ devl_param_driverinit_value_set(devlink, id, value);
+
+ value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH;
+ id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH;
+ devl_param_driverinit_value_set(devlink, id, value);
+}
+
+static const struct devlink_param mlx5_devlink_pcie_cong_params[] = {
+ DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
+ "pcie_cong_inbound_low", DEVLINK_PARAM_TYPE_U16,
+ BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
+ mlx5_devlink_pcie_cong_thresh_validate),
+ DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
+ "pcie_cong_inbound_high", DEVLINK_PARAM_TYPE_U16,
+ BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
+ mlx5_devlink_pcie_cong_thresh_validate),
+ DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
+ "pcie_cong_outbound_low", DEVLINK_PARAM_TYPE_U16,
+ BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
+ mlx5_devlink_pcie_cong_thresh_validate),
+ DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
+ "pcie_cong_outbound_high", DEVLINK_PARAM_TYPE_U16,
+ BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
+ mlx5_devlink_pcie_cong_thresh_validate),
+};
+
+static int mlx5_devlink_pcie_cong_params_register(struct devlink *devlink)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ int err;
+
+ if (!mlx5_pcie_cong_event_supported(dev))
+ return 0;
+
+ err = devl_params_register(devlink, mlx5_devlink_pcie_cong_params,
+ ARRAY_SIZE(mlx5_devlink_pcie_cong_params));
+ if (err)
+ return err;
+
+ mlx5_devlink_pcie_cong_init_values(devlink);
+
+ return 0;
+}
+
+static void mlx5_devlink_pcie_cong_params_unregister(struct devlink *devlink)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+
+ if (!mlx5_pcie_cong_event_supported(dev))
+ return;
+
+ devl_params_unregister(devlink, mlx5_devlink_pcie_cong_params,
+ ARRAY_SIZE(mlx5_devlink_pcie_cong_params));
+}
+
static int mlx5_devlink_enable_rdma_validate(struct devlink *devlink, u32 id,
union devlink_param_value val,
struct netlink_ext_ack *extack)
@@ -896,6 +995,10 @@ int mlx5_devlink_params_register(struct devlink *devlink)
if (err)
goto max_uc_list_err;
+ err = mlx5_devlink_pcie_cong_params_register(devlink);
+ if (err)
+ goto pcie_cong_err;
+
err = mlx5_nv_param_register_dl_params(devlink);
if (err)
goto nv_param_err;
@@ -903,6 +1006,8 @@ int mlx5_devlink_params_register(struct devlink *devlink)
return 0;
nv_param_err:
+ mlx5_devlink_pcie_cong_params_unregister(devlink);
+pcie_cong_err:
mlx5_devlink_max_uc_list_params_unregister(devlink);
max_uc_list_err:
mlx5_devlink_auxdev_params_unregister(devlink);
@@ -915,6 +1020,7 @@ auxdev_reg_err:
void mlx5_devlink_params_unregister(struct devlink *devlink)
{
mlx5_nv_param_unregister_dl_params(devlink);
+ mlx5_devlink_pcie_cong_params_unregister(devlink);
mlx5_devlink_max_uc_list_params_unregister(devlink);
mlx5_devlink_auxdev_params_unregister(devlink);
devl_params_unregister(devlink, mlx5_devlink_params,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 74bcdfa70361..c9555119a661 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -22,6 +22,10 @@ enum mlx5_devlink_param_id {
MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT,
MLX5_DEVLINK_PARAM_ID_HAIRPIN_NUM_QUEUES,
MLX5_DEVLINK_PARAM_ID_HAIRPIN_QUEUE_SIZE,
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c b/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c
index 0ed017569a19..2eb666a46f39 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+#include "../devlink.h"
#include "en.h"
#include "pcie_cong_event.h"
@@ -23,6 +24,7 @@ struct mlx5e_pcie_cong_stats {
u32 pci_bw_inbound_low;
u32 pci_bw_outbound_high;
u32 pci_bw_outbound_low;
+ u32 pci_bw_stale_event;
};
struct mlx5e_pcie_cong_event {
@@ -41,13 +43,6 @@ struct mlx5e_pcie_cong_event {
struct mlx5e_pcie_cong_stats stats;
};
-/* In units of 0.01 % */
-static const struct mlx5e_pcie_cong_thresh default_thresh_config = {
- .inbound_high = 9000,
- .inbound_low = 7500,
- .outbound_high = 9000,
- .outbound_low = 7500,
-};
static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = {
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
@@ -58,6 +53,8 @@ static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = {
pci_bw_outbound_high) },
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
pci_bw_outbound_low) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
+ pci_bw_stale_event) },
};
#define NUM_PCIE_CONG_COUNTERS ARRAY_SIZE(mlx5e_pcie_cong_stats_desc)
@@ -218,8 +215,10 @@ static void mlx5e_pcie_cong_event_work(struct work_struct *work)
}
changes = cong_event->state ^ new_cong_state;
- if (!changes)
+ if (!changes) {
+ cong_event->stats.pci_bw_stale_event++;
return;
+ }
cong_event->state = new_cong_state;
@@ -249,8 +248,60 @@ static int mlx5e_pcie_cong_event_handler(struct notifier_block *nb,
return NOTIFY_OK;
}
+static int
+mlx5e_pcie_cong_get_thresh_config(struct mlx5_core_dev *dev,
+ struct mlx5e_pcie_cong_thresh *config)
+{
+ u32 ids[4] = {
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
+ MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
+ };
+ struct devlink *devlink = priv_to_devlink(dev);
+ union devlink_param_value val[4];
+
+ for (int i = 0; i < 4; i++) {
+ u32 id = ids[i];
+ int err;
+
+ err = devl_param_driverinit_value_get(devlink, id, &val[i]);
+ if (err)
+ return err;
+ }
+
+ config->inbound_low = val[0].vu16;
+ config->inbound_high = val[1].vu16;
+ config->outbound_low = val[2].vu16;
+ config->outbound_high = val[3].vu16;
+
+ return 0;
+}
+
+static int
+mlx5e_thresh_config_validate(struct mlx5_core_dev *mdev,
+ const struct mlx5e_pcie_cong_thresh *config)
+{
+ int err = 0;
+
+ if (config->inbound_low >= config->inbound_high) {
+ err = -EINVAL;
+ mlx5_core_err(mdev, "PCIe inbound congestion threshold configuration invalid: low (%u) >= high (%u).\n",
+ config->inbound_low, config->inbound_high);
+ }
+
+ if (config->outbound_low >= config->outbound_high) {
+ err = -EINVAL;
+ mlx5_core_err(mdev, "PCIe outbound congestion threshold configuration invalid: low (%u) >= high (%u).\n",
+ config->outbound_low, config->outbound_high);
+ }
+
+ return err;
+}
+
int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
{
+ struct mlx5e_pcie_cong_thresh thresh_config = {};
struct mlx5e_pcie_cong_event *cong_event;
struct mlx5_core_dev *mdev = priv->mdev;
int err;
@@ -258,6 +309,16 @@ int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
if (!mlx5_pcie_cong_event_supported(mdev))
return 0;
+ err = mlx5e_pcie_cong_get_thresh_config(mdev, &thresh_config);
+ if (WARN_ON(err))
+ return err;
+
+ err = mlx5e_thresh_config_validate(mdev, &thresh_config);
+ if (err) {
+ mlx5_core_err(mdev, "PCIe congestion event feature disabled\n");
+ return err;
+ }
+
cong_event = kvzalloc_node(sizeof(*cong_event), GFP_KERNEL,
mdev->priv.numa_node);
if (!cong_event)
@@ -269,7 +330,7 @@ int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
cong_event->priv = priv;
- err = mlx5_cmd_pcie_cong_event_set(mdev, &default_thresh_config,
+ err = mlx5_cmd_pcie_cong_event_set(mdev, &thresh_config,
&cong_event->obj_id);
if (err) {
mlx5_core_warn(mdev, "Error creating a PCIe congestion event object\n");