netdev - [net-next 08/17] net/mlx5e: Activate HW multipath and handle port affinity based on FIB events

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20190301200848.9534-9-saeedm@mellanox.com>
Date:   Fri,  1 Mar 2019 12:08:39 -0800
From:   Saeed Mahameed <saeedm@...lanox.com>
To:     "David S. Miller" <davem@...emloft.net>
Cc:     netdev@...r.kernel.org, Roi Dayan <roid@...lanox.com>,
        Or Gerlitz <ogerlitz@...lanox.com>,
        Saeed Mahameed <saeedm@...lanox.com>
Subject: [net-next 08/17] net/mlx5e: Activate HW multipath and handle port affinity based on FIB events

From: Roi Dayan <roid@...lanox.com>

To support multipath offload we track the SW multipath route and its
related next-hops. To do that we register a FIB notifier, handle the
route and next-hop events, and reflect them as port affinity in HW.

When a new multipath route entry is added whose next-hops are all
ports of the same HCA, we activate LAG in HW.

Egress wise, we use HW LAG as the means to emulate multipath on current
HW which doesn't support port selection based on xmit hash. In the
presence of multiple VFs which use multiple SQs (send queues) this
yields fairly good distribution.

HA wise, HW LAG buys us the ability for a given RQ (receive queue) to
receive traffic from both ports and for SQs to migrate xmitting over
the active port if their base port fails.

When the route entry is being updated to single path we will update
the HW port affinity to use that port only.

If a next-hop becomes dead we update the HW port affinity to the living
port.

When all next-hops are alive again we reset the affinity to default.

Due to FW/HW limitations, when a route is deleted we do not disable
the HW LAG, since doing so would not allow us to enable it again while
VFs are bound. Typically this is just a temporary state when a
routing daemon removes dead routes and later adds them back as needed.

This patch only handles events for AF_INET.

Signed-off-by: Roi Dayan <roid@...lanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@...lanox.com>
Signed-off-by: Saeed Mahameed <saeedm@...lanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.c |   7 +
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/lag.c |   8 +
 drivers/net/ethernet/mellanox/mlx5/core/lag.h |   2 +
 .../net/ethernet/mellanox/mlx5/core/lag_mp.c  | 281 ++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lag_mp.h  |  26 ++
 6 files changed, 326 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 84200d93cf9d..d0b28251abf2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2476,3 +2476,10 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
 
 	return false;
 }
+
+bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
+			       struct mlx5_core_dev *dev1)
+{
+	return dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
+	       dev1->priv.eswitch->mode == SRIOV_OFFLOADS;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 5f82e637410b..3f3cd32ae60a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -371,6 +371,8 @@ static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev
 
 bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0,
 			 struct mlx5_core_dev *dev1);
+bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
+			       struct mlx5_core_dev *dev1);
 
 #define MLX5_DEBUG_ESWITCH_MASK BIT(3)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
index eff5c54818b7..48aa6e030bcf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -36,6 +36,7 @@
 #include "mlx5_core.h"
 #include "eswitch.h"
 #include "lag.h"
+#include "lag_mp.h"
 
 /* General purpose, use for short periods of time.
  * Beware of lock dependencies (preferably, no locks should be acquired
@@ -559,6 +560,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 {
 	struct mlx5_lag *ldev = NULL;
 	struct mlx5_core_dev *tmp_dev;
+	int err;
 
 	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
 	    !MLX5_CAP_GEN(dev, lag_master) ||
@@ -586,6 +588,11 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
 		}
 	}
+
+	err = mlx5_lag_mp_init(ldev);
+	if (err)
+		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
+			      err);
 }
 
 int mlx5_lag_get_pf_num(struct mlx5_core_dev *dev, int *pf_num)
@@ -631,6 +638,7 @@ void mlx5_lag_remove(struct mlx5_core_dev *dev)
 	if (i == MLX5_MAX_PORTS) {
 		if (ldev->nb.notifier_call)
 			unregister_netdevice_notifier(&ldev->nb);
+		mlx5_lag_mp_cleanup(ldev);
 		cancel_delayed_work_sync(&ldev->bond_work);
 		mlx5_lag_dev_free(ldev);
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
index f8bea6ed4285..1dea0b1c9826 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
@@ -5,6 +5,7 @@
 #define __MLX5_LAG_H__
 
 #include "mlx5_core.h"
+#include "lag_mp.h"
 
 enum {
 	MLX5_LAG_FLAG_ROCE   = 1 << 0,
@@ -38,6 +39,7 @@ struct mlx5_lag {
 	struct workqueue_struct   *wq;
 	struct delayed_work       bond_work;
 	struct notifier_block     nb;
+	struct lag_mp             lag_mp;
 };
 
 static inline struct mlx5_lag *
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
index 2d2861cd4e02..5680beba8c07 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@@ -3,9 +3,18 @@
 
 #include <linux/netdevice.h>
 #include "lag.h"
+#include "lag_mp.h"
 #include "mlx5_core.h"
 #include "eswitch.h"
 
+/* Both PFs must be present and satisfy the e-switch multipath prerequisite */
+static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev)
+{
+	struct mlx5_core_dev *dev0 = ldev->pf[0].dev, *dev1 = ldev->pf[1].dev;
+
+	return dev0 && dev1 && mlx5_esw_multipath_prereq(dev0, dev1);
+}
+
 static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev)
 {
 	return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH);
@@ -21,3 +30,275 @@ bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev)
 
 	return res;
 }
+
+/**
+ * mlx5_lag_set_port_affinity - Set lag port affinity
+ *
+ * @ldev: lag device
+ * @port: 0 - normal affinity, 1 - port 1 only, 2 - port 2 only.
+ *
+ * Does nothing unless the lag is in multipath mode. An invalid @port is
+ * rejected with a warning. The tracker starts zeroed so that fields not
+ * set below are never handed to mlx5_modify_lag() uninitialized.
+ */
+static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, int port)
+{
+	struct lag_tracker tracker = {};
+
+	if (!__mlx5_lag_is_multipath(ldev))
+		return;
+
+	switch (port) {
+	case 0:
+		tracker.netdev_state[0].tx_enabled = true;
+		tracker.netdev_state[1].tx_enabled = true;
+		tracker.netdev_state[0].link_up = true;
+		tracker.netdev_state[1].link_up = true;
+		break;
+	case 1:
+		tracker.netdev_state[0].tx_enabled = true;
+		tracker.netdev_state[0].link_up = true;
+		tracker.netdev_state[1].tx_enabled = false;
+		tracker.netdev_state[1].link_up = false;
+		break;
+	case 2:
+		tracker.netdev_state[0].tx_enabled = false;
+		tracker.netdev_state[0].link_up = false;
+		tracker.netdev_state[1].tx_enabled = true;
+		tracker.netdev_state[1].link_up = true;
+		break;
+	default:
+		mlx5_core_warn(ldev->pf[0].dev, "Invalid affinity port %d\n",
+			       port);
+		return;
+	}
+
+	mlx5_modify_lag(ldev, &tracker);
+}
+
+/* FIB notifier flush callback: wait for queued work on ldev->wq to finish */
+static void mlx5_lag_fib_event_flush(struct notifier_block *nb)
+{
+	struct mlx5_lag *lag = container_of(nb, struct mlx5_lag, lag_mp.fib_nb);
+
+	flush_workqueue(lag->wq);
+}
+
+struct mlx5_fib_event_work {
+	struct work_struct work; /* queued on ldev->wq */
+	struct mlx5_lag *ldev;
+	unsigned long event; /* FIB_EVENT_* code; selects the union member */
+	union {
+		struct fib_entry_notifier_info fen_info; /* ENTRY_* events */
+		struct fib_nh_notifier_info fnh_info; /* NH_* events */
+	};
+};
+
+/* Track the multipath route and reflect its state as HW port affinity */
+static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev,
+				     unsigned long event,
+				     struct fib_info *fi)
+{
+	struct lag_mp *mp = &ldev->lag_mp;
+
+	/* Route deleted: stop tracking it, HW LAG state is left untouched */
+	if (event == FIB_EVENT_ENTRY_DEL) {
+		if (mp->mfi == fi)
+			mp->mfi = NULL;
+		return;
+	}
+
+	/* Single path route: set affinity to the port of its only next hop */
+	if (fi->fib_nhs == 1) {
+		if (__mlx5_lag_is_active(ldev)) {
+			struct net_device *nh_dev = fi->fib_nh[0].nh_dev;
+			int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev);
+
+			/* Not one of our ports (i < 0): use normal affinity */
+			mlx5_lag_set_port_affinity(ldev, i < 0 ? 0 : i + 1);
+		}
+		return;
+	}
+
+	if (fi->fib_nhs != 2)
+		return;
+
+	/* Verify next hops are ports of the same hca */
+	if (!(fi->fib_nh[0].nh_dev == ldev->pf[0].netdev &&
+	      fi->fib_nh[1].nh_dev == ldev->pf[1].netdev) &&
+	    !(fi->fib_nh[0].nh_dev == ldev->pf[1].netdev &&
+	      fi->fib_nh[1].nh_dev == ldev->pf[0].netdev)) {
+		mlx5_core_warn(ldev->pf[0].dev, "Multipath offload require two ports of the same HCA\n");
+		return;
+	}
+
+	/* First time we see multipath route */
+	if (!mp->mfi && !__mlx5_lag_is_active(ldev)) {
+		struct lag_tracker tracker = ldev->tracker;
+
+		mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH);
+	}
+
+	mlx5_lag_set_port_affinity(ldev, 0);
+	mp->mfi = fi;
+}
+
+/* Adjust port affinity when a next hop of the tracked route comes or goes */
+static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev,
+				       unsigned long event,
+				       struct fib_nh *fib_nh,
+				       struct fib_info *fi)
+{
+	struct fib_info *tracked = ldev->lag_mp.mfi;
+
+	/* Ignore next hops that do not belong to the tracked route */
+	if (!tracked || tracked != fi)
+		return;
+
+	if (event == FIB_EVENT_NH_ADD) {
+		/* Route has two next hops: reset to normal affinity */
+		if (fi->fib_nhs == 2)
+			mlx5_lag_set_port_affinity(ldev, 0);
+	} else if (event == FIB_EVENT_NH_DEL) {
+		int idx = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->nh_dev);
+
+		/* Next hop removed: move affinity to the peer port */
+		if (idx >= 0)
+			mlx5_lag_set_port_affinity(ldev, (idx + 1) % 2 + 1);
+	}
+}
+
+/* Work handler: process one deferred FIB event, then free the work item */
+static void mlx5_lag_fib_update(struct work_struct *work)
+{
+	struct mlx5_fib_event_work *fw =
+		container_of(work, struct mlx5_fib_event_work, work);
+	unsigned long event = fw->event;
+	struct fib_info *fi;
+
+	/* Protect internal structures from changes */
+	rtnl_lock();
+	switch (event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		fi = fw->fen_info.fi;
+		mlx5_lag_fib_route_event(fw->ldev, event, fi);
+		/* Drop the reference taken when the work was queued */
+		fib_info_put(fi);
+		break;
+	case FIB_EVENT_NH_ADD: /* fall through */
+	case FIB_EVENT_NH_DEL:
+		fi = fw->fnh_info.fib_nh->nh_parent;
+		mlx5_lag_fib_nexthop_event(fw->ldev, event,
+					   fw->fnh_info.fib_nh, fi);
+		fib_info_put(fi);
+		break;
+	}
+
+	rtnl_unlock();
+	kfree(fw);
+}
+
+/* Allocate (GFP_ATOMIC) and set up a work item for @event; NULL on failure */
+static struct mlx5_fib_event_work *
+mlx5_lag_init_fib_work(struct mlx5_lag *ldev, unsigned long event)
+{
+	struct mlx5_fib_event_work *fw = kzalloc(sizeof(*fw), GFP_ATOMIC);
+
+	if (WARN_ON(!fw))
+		return NULL;
+
+	fw->event = event;
+	fw->ldev = ldev;
+	INIT_WORK(&fw->work, mlx5_lag_fib_update);
+
+	return fw;
+}
+
+/* FIB notifier callback. Filters IPv4 route and next-hop events, copies the
+ * notifier info into a work item and queues it on ldev->wq, where it is
+ * handled by mlx5_lag_fib_update(). Events are dropped while the multipath
+ * prerequisites are not met. Always returns NOTIFY_DONE.
+ */
+static int mlx5_lag_fib_event(struct notifier_block *nb,
+			      unsigned long event,
+			      void *ptr)
+{
+	struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb);
+	struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp);
+	struct fib_notifier_info *info = ptr;
+	struct fib_entry_notifier_info *fen;
+	struct fib_nh_notifier_info *fnh;
+	struct mlx5_fib_event_work *fw;
+
+	if (info->family != AF_INET || !mlx5_lag_multipath_check_prereq(ldev))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		fen = container_of(info, struct fib_entry_notifier_info, info);
+		/* Skip routes whose device is neither of our two ports */
+		if (fen->fi->fib_dev != ldev->pf[0].netdev &&
+		    fen->fi->fib_dev != ldev->pf[1].netdev)
+			return NOTIFY_DONE;
+
+		fw = mlx5_lag_init_fib_work(ldev, event);
+		if (!fw)
+			return NOTIFY_DONE;
+		fw->fen_info = *fen;
+		/* Take reference on fib_info to prevent it from being
+		 * freed while work is queued. Release it afterwards.
+		 */
+		fib_info_hold(fw->fen_info.fi);
+		break;
+	case FIB_EVENT_NH_ADD: /* fall through */
+	case FIB_EVENT_NH_DEL:
+		fnh = container_of(info, struct fib_nh_notifier_info, info);
+		fw = mlx5_lag_init_fib_work(ldev, event);
+		if (!fw)
+			return NOTIFY_DONE;
+		fw->fnh_info = *fnh;
+		/* Hold the next hop's parent fib_info until the work ran */
+		fib_info_hold(fw->fnh_info.fib_nh->nh_parent);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	queue_work(ldev->wq, &fw->work);
+
+	return NOTIFY_DONE;
+}
+
+/* Register the FIB notifier; a set notifier_call marks it as registered */
+int mlx5_lag_mp_init(struct mlx5_lag *ldev)
+{
+	struct notifier_block *fib_nb = &ldev->lag_mp.fib_nb;
+	int err;
+
+	if (fib_nb->notifier_call)
+		return 0;
+
+	fib_nb->notifier_call = mlx5_lag_fib_event;
+	err = register_fib_notifier(fib_nb, mlx5_lag_fib_event_flush);
+	if (err)
+		fib_nb->notifier_call = NULL;
+
+	return err;
+}
+
+/* Unregister the FIB notifier if it is currently registered */
+void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev)
+{
+	struct notifier_block *fib_nb = &ldev->lag_mp.fib_nb;
+
+	if (fib_nb->notifier_call) {
+		unregister_fib_notifier(fib_nb);
+		fib_nb->notifier_call = NULL;
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h
new file mode 100644
index 000000000000..6d14b1100be9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_LAG_MP_H__
+#define __MLX5_LAG_MP_H__
+
+#include "lag.h"
+#include "mlx5_core.h"
+
+struct lag_mp {
+	struct notifier_block     fib_nb; /* FIB event notifier */
+	struct fib_info           *mfi; /* tracked multipath route or NULL */
+};
+
+#ifdef CONFIG_MLX5_ESWITCH
+
+int mlx5_lag_mp_init(struct mlx5_lag *ldev);
+void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev);
+
+#else /* CONFIG_MLX5_ESWITCH */
+
+static inline int mlx5_lag_mp_init(struct mlx5_lag *ldev) { return 0; }
+static inline void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) {}
+
+#endif /* CONFIG_MLX5_ESWITCH */
+#endif /* __MLX5_LAG_MP_H__ */
-- 
2.20.1