netdev - [PATCH v2 2/2] bonding: add multi-link mode

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1292540003-9465-3-git-send-email-fubar@us.ibm.com>
Date:	Thu, 16 Dec 2010 14:53:23 -0800
From:	Jay Vosburgh <fubar@...ibm.com>
To:	netdev@...r.kernel.org
Cc:	Andy Gospodarek <andy@...yhouse.net>
Subject: [PATCH v2 2/2] bonding: add multi-link mode

Adds multi-link mode for bonding.

	This mode performs per-subnet balancing, wherein each slave is
typically a member of a discrete IP subnet, and the multi-link (ML)
addresses exist in a subnet of their own.  A user space daemon runs the
ML discovery protocol, which locates other ML hosts and exchanges link
information.  The daemon then informs bonding of the appropriate set of
slaves to reach a particular ML destination.  The ML daemon also monitors
the links to ensure continued availability.

	Note that ML slaves maintain their assigned IP addresses, and
may operate outside the scope of the bond.
---
 drivers/net/bonding/Makefile    |    3 +-
 drivers/net/bonding/bond_main.c |   34 ++-
 drivers/net/bonding/bond_ml.c   |  670 +++++++++++++++++++++++++++++++++++++++
 drivers/net/bonding/bond_ml.h   |   94 ++++++
 drivers/net/bonding/bonding.h   |   13 +
 include/linux/if.h              |    1 +
 include/linux/if_bonding.h      |   15 +
 net/core/dev.c                  |   37 ++-
 8 files changed, 855 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/bonding/bond_ml.c
 create mode 100644 drivers/net/bonding/bond_ml.h

diff --git a/drivers/net/bonding/Makefile b/drivers/net/bonding/Makefile
index 26848a2..0b6ed50 100644
--- a/drivers/net/bonding/Makefile
+++ b/drivers/net/bonding/Makefile
@@ -4,7 +4,8 @@
 
 obj-$(CONFIG_BONDING) += bonding.o
 
-bonding-objs := bond_main.o bond_3ad.o bond_alb.o bond_sysfs.o bond_netlink.o
+bonding-objs := bond_main.o bond_3ad.o bond_alb.o bond_sysfs.o bond_netlink.o \
+	bond_ml.o
 
 ipv6-$(subst m,y,$(CONFIG_IPV6)) += bond_ipv6.o
 bonding-objs += $(ipv6-y)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4d3a2c8..1399949 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -200,6 +200,7 @@ const struct bond_parm_tbl bond_mode_tbl[] = {
 {	"802.3ad",		BOND_MODE_8023AD},
 {	"balance-tlb",		BOND_MODE_TLB},
 {	"balance-alb",		BOND_MODE_ALB},
+{	"multi-link",		BOND_MODE_ML},
 {	NULL,			-1},
 };
 
@@ -257,9 +258,10 @@ static const char *bond_mode_name(int mode)
 		[BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
 		[BOND_MODE_TLB] = "transmit load balancing",
 		[BOND_MODE_ALB] = "adaptive load balancing",
+		[BOND_MODE_ML] = "multi-link",
 	};
 
-	if (mode < 0 || mode > BOND_MODE_ALB)
+	if (mode < 0 || mode > BOND_MODE_ML)
 		return "unknown";
 
 	return names[mode];
@@ -1603,7 +1605,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	 */
 	memcpy(new_slave->perm_hwaddr, slave_dev->dev_addr, ETH_ALEN);
 
-	if (!bond->params.fail_over_mac) {
+	if (!bond->params.fail_over_mac && bond->params.mode != BOND_MODE_ML) {
 		/*
 		 * Set slave to master's mac address.  The application already
 		 * set the master's mac address to that of the first slave
@@ -2097,6 +2099,9 @@ static int bond_release_all(struct net_device *bond_dev)
 		if (bond->params.mode == BOND_MODE_8023AD)
 			bond_3ad_unbind_slave(slave);
 
+		if (bond->params.mode == BOND_MODE_ML)
+			bond_ml_unbind_slave(bond, slave);
+
 		slave_dev = slave->dev;
 		bond_detach_slave(bond, slave);
 
@@ -3357,6 +3362,8 @@ static void bond_info_show_master(struct seq_file *seq)
 			seq_printf(seq, "\tPartner Mac Address: %pM\n",
 				   ad_info.partner_system);
 		}
+	} else if (bond->params.mode == BOND_MODE_ML) {
+		bond_ml_show_proc(seq, bond);
 	}
 }
 
@@ -3843,6 +3850,11 @@ static int bond_open(struct net_device *bond_dev)
 		bond_3ad_initiate_agg_selection(bond, 1);
 	}
 
+	if (bond->params.mode == BOND_MODE_ML) {
+		INIT_DELAYED_WORK(&bond->ml_work, bond_ml_monitor);
+		queue_delayed_work(bond->wq, &bond->ml_work, 0);
+	}
+
 	return 0;
 }
 
@@ -3884,6 +3896,9 @@ static int bond_close(struct net_device *bond_dev)
 	case BOND_MODE_ALB:
 		cancel_delayed_work(&bond->alb_work);
 		break;
+	case BOND_MODE_ML:
+		cancel_delayed_work(&bond->ml_work);
+		break;
 	default:
 		break;
 	}
@@ -4602,6 +4617,8 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	case BOND_MODE_ALB:
 	case BOND_MODE_TLB:
 		return bond_alb_xmit(skb, dev);
+	case BOND_MODE_ML:
+		return bond_xmit_ml(skb, dev);
 	default:
 		/* Should never happen, mode already checked */
 		pr_err("%s: Error: Unknown bonding mode %d\n",
@@ -4639,6 +4656,11 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
 		/* FALLTHRU */
 	case BOND_MODE_TLB:
 		break;
+	case BOND_MODE_ML:
+		bond_set_xmit_hash_policy(bond);
+		bond_set_master_ml_flags(bond);
+		bond_ml_init(bond);
+		break;
 	default:
 		/* Should never happen, mode already checked */
 		pr_err("%s: Error: Unknown bonding mode %d\n",
@@ -4713,7 +4735,6 @@ void bond_setup(struct net_device *bond_dev)
 	ether_setup(bond_dev);
 	bond_dev->netdev_ops = &bond_netdev_ops;
 	bond_dev->ethtool_ops = &bond_ethtool_ops;
-	bond_set_mode_ops(bond, bond->params.mode);
 
 	bond_dev->destructor = bond_destructor;
 
@@ -4726,6 +4747,8 @@ void bond_setup(struct net_device *bond_dev)
 	if (bond->params.arp_interval)
 		bond_dev->priv_flags |= IFF_MASTER_ARPMON;
 
+	bond_set_mode_ops(bond, bond->params.mode);
+
 	/* At first, we block adding VLANs. That's the only way to
 	 * prevent problems that occur when adding VLANs over an
 	 * empty bond. The block will be removed once non-challenged
@@ -4773,6 +4796,10 @@ static void bond_work_cancel_all(struct bonding *bond)
 	    delayed_work_pending(&bond->ad_work))
 		cancel_delayed_work(&bond->ad_work);
 
+	if (bond->params.mode == BOND_MODE_ML &&
+	    delayed_work_pending(&bond->ml_work))
+		cancel_delayed_work(&bond->ml_work);
+
 	if (delayed_work_pending(&bond->mcast_work))
 		cancel_delayed_work(&bond->mcast_work);
 }
@@ -4858,6 +4885,7 @@ static int bond_check_params(struct bond_params *params)
 
 	if (xmit_hash_policy) {
 		if ((bond_mode != BOND_MODE_XOR) &&
+		    (bond_mode != BOND_MODE_ML) &&
 		    (bond_mode != BOND_MODE_8023AD)) {
 			pr_info("xmit_hash_policy param is irrelevant in mode %s\n",
 			       bond_mode_name(bond_mode));
diff --git a/drivers/net/bonding/bond_ml.c b/drivers/net/bonding/bond_ml.c
new file mode 100644
index 0000000..3cfe518
--- /dev/null
+++ b/drivers/net/bonding/bond_ml.c
@@ -0,0 +1,670 @@
+/*
+ * Multi-link mode support for bonding
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2010
+ *
+ * Author: Jay Vosburgh <fubar@...ibm.com>
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_bonding.h>
+#include <linux/in.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/genetlink.h>
+
+#include "bonding.h"
+
+extern struct genl_family bond_genl_family;
+extern struct genl_multicast_group bond_genl_mcgrp;
+extern int bond_nl_seq;
+
+static u32 bond_ml_salt __read_mostly;
+
+/*
+ * Map an ML IP address to a bucket index in ml_rtable[].
+ *
+ * BOND_ML_HASH_SZ is 31, which is not a power of two, so masking with
+ * (BOND_ML_HASH_SZ - 1) == 30 would always clear bit 0 and leave every
+ * odd-numbered bucket unused.  Reduce with a modulus instead; the result
+ * is always in the range [0, BOND_ML_HASH_SZ).
+ */
+static inline int bond_ml_hash(const __be32 mladdr)
+{
+	return jhash_1word(mladdr, bond_ml_salt) % BOND_ML_HASH_SZ;
+}
+
+/*
+ * Create new ml_route entry, insert into hash table.
+ *
+ * The ML address is stored in the new entry right away so that
+ * bond_mlr_destroy(), which derives the bucket from mlr->ml_ipaddr, looks
+ * in the same bucket the entry was linked into even if the route never
+ * reaches MLRT_COMPLETE.  The entry stays invisible to
+ * bond_ml_route_output() until its state is changed, because lookups only
+ * match MLRT_COMPLETE entries.
+ *
+ * Returns the new entry in state MLRT_EMPTY, or NULL if allocation fails.
+ *
+ * Caller holds bond->lock for write.
+ */
+static struct ml_route *bond_mlr_create(struct bonding *bond, __be32 mladdr)
+{
+	struct ml_route *mlr;
+	int hash;
+
+	/* GFP_ATOMIC: the caller holds bond->lock */
+	mlr = kzalloc(sizeof(*mlr), GFP_ATOMIC);
+	if (!mlr)
+		return NULL;
+
+	mlr->state = MLRT_EMPTY;
+	mlr->ml_ipaddr.addr.s_addr = mladdr;
+
+	/* push onto the head of the bucket's singly linked chain */
+	hash = bond_ml_hash(mladdr);
+	mlr->next = bond->ml_info.ml_rtable[hash];
+	bond->ml_info.ml_rtable[hash] = mlr;
+
+	return mlr;
+}
+
+/*
+ * Unlink mlr from the singly linked chain whose head pointer is *pp.
+ * Returns true if the entry was found and unlinked, false otherwise.
+ */
+static bool bond_mlr_unlink(struct ml_route **pp, struct ml_route *mlr)
+{
+	for (; *pp; pp = &(*pp)->next) {
+		if (*pp == mlr) {
+			*pp = mlr->next;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Destroy ml_route entry.  Remove from hash table if necessary, then free.
+ * Caller responsible for freeing ml_dest table.
+ *
+ * The bucket derived from the entry's stored ML address is searched first.
+ * If the entry is not there (e.g. it is destroyed before its address was
+ * recorded), every other bucket is scanned so that a freed entry is never
+ * left linked in the table.  An error is logged if the entry is not found
+ * anywhere; it is freed regardless.
+ *
+ * Caller holds bond->lock for write.
+ */
+static void bond_mlr_destroy(struct bonding *bond, struct ml_route *mlr)
+{
+	int hash, i;
+
+	pr_debug("bmd: mlr %p n %p\n", mlr, mlr->next);
+
+	hash = bond_ml_hash(mlr->ml_ipaddr.addr.s_addr);
+	pr_debug("bmd: ip %x h %x rt[h] %p \n", mlr->ml_ipaddr.addr.s_addr,
+		 hash, bond->ml_info.ml_rtable[hash]);
+
+	if (bond_mlr_unlink(&bond->ml_info.ml_rtable[hash], mlr))
+		goto out;
+
+	/* not in the expected bucket: fall back to a full table scan */
+	for (i = 0; i < BOND_ML_HASH_SZ; i++) {
+		if (i != hash &&
+		    bond_mlr_unlink(&bond->ml_info.ml_rtable[i], mlr))
+			goto out;
+	}
+
+	pr_err("%s: bond_mlr_destroy: mlr %p has next, but not in table\n",
+	       bond->dev->name, mlr);
+
+out:
+	kfree(mlr);
+}
+
+/*
+ * Find the usable route for an ML IP address.
+ *
+ * Walks the hash chain selected by mladdr and returns the first entry that
+ * is in state MLRT_COMPLETE and whose stored address equals mladdr, or NULL
+ * when the chain holds no such entry.
+ *
+ * Caller holds bond->lock for read or better.
+ */
+static struct ml_route *bond_ml_route_output(struct bonding *bond, __be32 mladdr)
+{
+	struct ml_route *rt;
+
+	for (rt = bond->ml_info.ml_rtable[bond_ml_hash(mladdr)]; rt;
+	     rt = rt->next) {
+		if (rt->state != MLRT_COMPLETE)
+			continue;
+		if (rt->ml_ipaddr.addr.s_addr != mladdr)
+			continue;
+		return rt;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find "nth" ml_dest in supplied ml_route, where nth is zero-based.  Used
+ * by TX to find suitable slave to send on.  N must be less than
+ * mlr->num_dest.
+ *
+ * Only slots whose bit is set in mlr->ml_dest_map are counted.  Returns
+ * NULL, rather than indexing past the end of ml_dest[], if fewer than
+ * nth + 1 slots are populated.
+ */
+static struct ml_dest *bond_mlr_dest_output(struct ml_route *mlr, int nth)
+{
+	unsigned long b;
+
+	b = find_next_bit(&mlr->ml_dest_map, BOND_ML_NDEST, 0);
+	while (b < BOND_ML_NDEST && nth-- > 0)
+		b = find_next_bit(&mlr->ml_dest_map, BOND_ML_NDEST, b + 1);
+
+	/* find_next_bit() returns the bitmap size when no bit is left */
+	if (b >= BOND_ML_NDEST)
+		return NULL;
+
+	return mlr->ml_dest[b];
+}
+
+/*
+ * Return the first populated destination of mlr that satisfies the given
+ * address filters, or NULL if there is none.  A zero laddr or raddr acts
+ * as a wildcard for that field.
+ */
+static struct ml_dest *bond_mlr_dest_find(struct ml_route *mlr, __be32 laddr, __be32 raddr)
+{
+	int slot;
+
+/* XXX use bitmap for testing for in-use, limit size of loop */
+	for (slot = 0; slot < BOND_ML_NDEST; slot++) {
+		struct ml_dest *cand = mlr->ml_dest[slot];
+
+		if (!cand)
+			continue;
+		if ((!laddr || laddr == cand->laddr) &&
+		    (!raddr || raddr == cand->raddr))
+			return cand;
+	}
+
+	return NULL;
+}
+
+/*
+ * Release one destination of an ML route.
+ *
+ * Clears mld's slot in mlr (array pointer, bitmap bit and num_dest), drops
+ * the neighbour reference held by the destination, if any, and frees it.
+ * When the last destination goes away the route is marked MLRT_INCOMPLETE
+ * and MLDD_IF_DOWN, so bond_ml_route_output() no longer returns it.
+ *
+ * Does nothing beyond a debug message if mld is not installed in mlr.
+ * The bond argument is currently unused.  The callers in this file run
+ * with bond->lock held for write.
+ */
+static void bond_mlr_dest_free(struct bonding *bond, struct ml_route *mlr, struct ml_dest *mld)
+{
+	int i;
+
+	/* a destination whose setup was abandoned has no slave yet */
+	pr_debug("dest_free: s %s l %pI4 r %pI4 ml %pI4\n",
+		 mld->slave ? mld->slave->dev->name : "(none)",
+		 &mld->laddr, &mld->raddr, &mlr->ml_ipaddr.addr);
+
+	for (i = 0; i < BOND_ML_NDEST; i++) {
+		if (mlr->ml_dest[i] == mld)
+			break;
+	}
+
+	if (i == BOND_ML_NDEST) {
+		pr_debug("bond_mlr_dest_free: mld not found in mlr\n");
+		return;
+	}
+
+	mlr->ml_dest[i] = NULL;
+	mlr->num_dest--;
+
+	if (mld->neigh)
+		neigh_release(mld->neigh);
+
+	/* poison the magic so bond_ml_monitor() can spot a stale pointer */
+	mld->magic = 0x0bad0bad;
+	kfree(mld);
+
+	clear_bit(i, &mlr->ml_dest_map);
+	if (mlr->ml_dest_map)
+		return;
+
+	/*
+	 * Last destination gone.  The ML address is left in place on
+	 * purpose: bond_mlr_destroy() hashes it to find the entry's bucket.
+	 */
+	mlr->state = MLRT_INCOMPLETE;
+	mlr->ml_ipaddr.flag = MLDD_IF_DOWN;
+}
+
+/*
+ * Allocate a zeroed ml_dest and install it in the first free slot of mlr,
+ * updating the slot bitmap and num_dest.  The new entry carries
+ * BOND_MLD_MAGIC; all other fields are left for the caller to fill in.
+ *
+ * Returns NULL if every slot is taken or the allocation fails.
+ */
+static struct ml_dest *bond_mlr_dest_new(struct ml_route *mlr)
+{
+	struct ml_dest *fresh;
+	int slot = find_first_zero_bit(&mlr->ml_dest_map, BOND_ML_NDEST);
+
+	if (slot == BOND_ML_NDEST)
+		return NULL;
+
+	fresh = kzalloc(sizeof(*fresh), GFP_ATOMIC);
+	if (fresh) {
+		fresh->magic = BOND_MLD_MAGIC;
+		set_bit(slot, &mlr->ml_dest_map);
+		mlr->ml_dest[slot] = fresh;
+		mlr->num_dest++;
+	}
+
+	return fresh;
+}
+
+/*
+ * Remove one destination (matched by laddr / raddr, zero meaning "any")
+ * from the complete route for mladdr.
+ *
+ * Takes bond->lock for write.  Returns 0 on success or -ENOENT when either
+ * the route or a matching destination does not exist.  The slave argument
+ * is not used.
+ */
+int bond_ml_delrt(struct bonding *bond, struct in_addr laddr, struct in_addr raddr, struct in_addr mladdr, struct slave *slave)
+{
+	struct ml_route *rt;
+	struct ml_dest *dst = NULL;
+	int rv = -ENOENT;
+
+	pr_debug("ml_delrt: l %pI4 r %pI4 ml %pI4\n", &laddr, &raddr, &mladdr);
+	write_lock_bh(&bond->lock);
+
+	rt = bond_ml_route_output(bond, mladdr.s_addr);
+	if (rt)
+		dst = bond_mlr_dest_find(rt, laddr.s_addr, raddr.s_addr);
+
+	if (dst) {
+		bond_mlr_dest_free(bond, rt, dst);
+		rv = 0;
+	}
+
+	write_unlock_bh(&bond->lock);
+	return rv;
+}
+
+/*
+ * Add a destination (local address, remote address, slave) to the route
+ * for ML address mladdr, creating the route if no complete one exists.
+ *
+ * Everything that can fail for reasons unrelated to the route itself (the
+ * slave lookup and the neighbour lookup) is done before a destination slot
+ * is claimed, so a failure never leaves a half-initialised ml_dest, with a
+ * NULL slave or neighbour, installed in the table.
+ *
+ * Takes bond->lock for write.  Returns 0 on success, or:
+ *   -EEXIST  a destination matching laddr/raddr is already present
+ *   -EINVAL  slave->dev is not a slave of this bond
+ *   -ENOMEM  neighbour entry or route allocation failed
+ *   -ENOSPC  no destination could be added to the route
+ */
+int bond_ml_addrt(struct bonding *bond, struct in_addr laddr, struct in_addr raddr, struct in_addr mladdr, struct slave *slave)
+{
+	struct ml_route *mlr;
+	struct ml_dest *mld;
+	struct slave *bslave;
+	struct neighbour *n;
+	int rv = 0, alloc_mlr = 0;
+
+	pr_debug("ml_addrt: %s l %pI4 r %pI4 m %pI4 s %s\n", bond->dev->name,
+		 &laddr, &raddr, &mladdr, slave->dev->name);
+
+	write_lock_bh(&bond->lock);
+
+	mlr = bond_ml_route_output(bond, mladdr.s_addr);
+	if (mlr && bond_mlr_dest_find(mlr, laddr.s_addr, raddr.s_addr)) {
+		rv = -EEXIST;
+		goto out;
+	}
+
+	bslave = bond_get_slave_by_dev(bond, slave->dev);
+	if (!bslave) {
+		pr_debug("%s: %s not slave\n", bond->dev->name,
+			 slave->dev->name);
+		rv = -EINVAL;
+		goto out;
+	}
+
+	/* on success this holds a reference that the destination will own */
+	n = __neigh_lookup(&arp_tbl, &raddr.s_addr, bslave->dev, 1);
+	if (!n) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	if (!mlr) {
+		mlr = bond_mlr_create(bond, mladdr.s_addr);
+		if (!mlr) {
+			rv = -ENOMEM;
+			goto out_neigh;
+		}
+		alloc_mlr++;
+		/*
+		 * Record the address immediately: bond_mlr_destroy() hashes
+		 * it to locate the bucket if we have to back out below.
+		 */
+		mlr->ml_ipaddr.addr.s_addr = mladdr.s_addr;
+	}
+
+	mld = bond_mlr_dest_new(mlr);
+	if (!mld) {
+		rv = -ENOSPC;
+		goto out_destroy;
+	}
+
+	mld->slave = bslave;
+	mld->laddr = laddr.s_addr;
+	mld->raddr = raddr.s_addr;
+	mld->neigh = n;
+
+	n->used = jiffies;
+	neigh_event_send(n, NULL);
+
+	mlr->state = MLRT_COMPLETE;
+	mlr->ml_ipaddr.addr.s_addr = mladdr.s_addr;
+	mlr->ml_ipaddr.flag = MLDD_IF_UP;
+	goto out;
+
+out_destroy:
+	/* only tear down a route that this call created */
+	if (alloc_mlr)
+		bond_mlr_destroy(bond, mlr);
+out_neigh:
+	neigh_release(n);
+out:
+	write_unlock_bh(&bond->lock);
+	return rv;
+}
+
+/*
+ * Drop every ML route of the bond: free each route's destinations, then
+ * the route itself.  Runs with bond->lock taken for write.  A trailing
+ * debug pass reports any bucket that is still non-empty afterwards.
+ */
+void bond_ml_rt_flush(struct bonding *bond)
+{
+	struct ml_route *rt, *following;
+	int bucket, slot;
+
+	write_lock_bh(&bond->lock);
+
+/* XXX use list_entry vs. mlr->next; make ml_rtable into hash bucket headers */
+	for (bucket = 0; bucket < BOND_ML_HASH_SZ; bucket++) {
+		for (rt = bond->ml_info.ml_rtable[bucket]; rt; rt = following) {
+			for (slot = 0; slot < BOND_ML_NDEST; slot++) {
+				if (!rt->ml_dest[slot])
+					continue;
+				bond_mlr_dest_free(bond, rt, rt->ml_dest[slot]);
+			}
+
+			/* read the link before the entry is freed */
+			following = rt->next;
+			bond_mlr_destroy(bond, rt);
+		}
+	}
+
+/* XXX debug verification */
+	for (bucket = 0; bucket < BOND_ML_HASH_SZ; bucket++) {
+		rt = bond->ml_info.ml_rtable[bucket];
+
+		if (rt)
+			printk("bmrf: BAD: hash %d !NULL %p\n", bucket, rt);
+	}
+/* XXX end verification */
+
+	write_unlock_bh(&bond->lock);
+}
+
+
+/*
+ * Send DISCOVERY message to daemon
+ *
+ * For DISCOVERY, MLADDR is the remote MLADDR we need to resolve.
+ *
+ * Builds a BOND_GENL_ML_CMD_DISCOVERY generic netlink message carrying the
+ * ML address and the bond's ifindex and multicasts it to bond_genl_mcgrp.
+ * Returns the genlmsg_multicast() result, -ENOMEM if the message buffer
+ * cannot be allocated, or -EMSGSIZE if the message cannot be assembled.
+ */
+static int bond_ml_discovery(struct bonding *bond, __be32 mladdr)
+{
+	struct sk_buff *nlskb;
+	void *hdr;
+
+	nlskb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!nlskb)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(nlskb, 0, bond_nl_seq++, &bond_genl_family, 0,
+			  BOND_GENL_ML_CMD_DISCOVERY);
+	if (!hdr)
+		goto nla_put_failure;
+
+	/* presumably these macros jump to nla_put_failure on error */
+	NLA_PUT_U32(nlskb, BOND_GENL_ATTR_ML_MLADDR, mladdr);
+	NLA_PUT_U32(nlskb, BOND_GENL_ATTR_MASTER_INDEX, bond->dev->ifindex);
+
+	if (genlmsg_end(nlskb, hdr) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_multicast(nlskb, 0, bond_genl_mcgrp.id, GFP_ATOMIC);
+
+nla_put_failure:
+	nlmsg_free(nlskb);
+	return -EMSGSIZE;
+}
+
+/*
+ * Look up skb's IP destination in ML route table
+ * If exists, send the packet via the found ML destination
+ * If not, initiate ML discovery
+ *
+ * IPv4 packets with no complete route trigger a discovery request and are
+ * dropped.  ARP is not expected on an ML master and is dropped.  Any other
+ * protocol is sent on the bond's first slave.
+ *
+ * rv tracks skb ownership: zero means the skb was handed to
+ * bond_dev_queue_xmit(); any non-zero value means it is still ours and is
+ * freed at "out".  Always returns NETDEV_TX_OK.
+ */
+int bond_xmit_ml(struct sk_buff *skb, struct net_device *bond_dev)
+{
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct ml_route *mlr;
+	struct ml_dest *mld;
+	struct iphdr *iph;
+	struct neighbour *n;
+	struct net_device *slave_dev;
+	int rv = 1;
+	int sl;
+
+	read_lock(&bond->lock);
+
+	if (!BOND_IS_OK(bond))
+		goto out;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		iph = ip_hdr(skb);
+		if (!iph) {
+			pr_debug("b_x_ml: no iph\n");
+			goto out;
+		}
+
+		mlr = bond_ml_route_output(bond, iph->daddr);
+		if (!mlr) {
+			rv = bond_ml_discovery(bond, iph->daddr);
+			pr_debug("b_x_ml: %s disco s %pI4 d %pI4 rv %d\n",
+				 bond->dev->name, &iph->saddr, &iph->daddr, rv);
+			/*
+			 * The packet is not queued anywhere while discovery
+			 * runs, so it must be dropped even when the request
+			 * was sent successfully (rv == 0); otherwise the skb
+			 * would leak.
+			 */
+			rv = 1;
+			goto out;
+		}
+
+		sl = bond->xmit_hash_policy(skb, mlr->num_dest);
+		mld = bond_mlr_dest_output(mlr, sl);
+		if (!mld) {
+			pr_debug("b_x_ml: no mld sl %d n_d %d\n", sl,
+				 mlr->num_dest);
+			goto out;
+		}
+		if (!mld->slave) {
+			pr_debug("b_x_ml: no slave\n");
+			goto out;
+		}
+
+		n = mld->neigh;
+		if (n) {
+			/*
+			 * The master has no header_ops->create, so build the
+			 * link-layer header with the chosen slave's ops.
+			 */
+			slave_dev = mld->slave->dev;
+			rv = dev_hard_header(skb, slave_dev,
+					     ntohs(skb->protocol), n->ha,
+					     slave_dev->dev_addr, skb->len);
+			if (rv < 0) {
+				pr_debug("b_x_ml: hard_header rv %d\n", rv);
+				goto out;
+			}
+		} else {
+			/*
+			 * NOTE(review): the packet is still transmitted
+			 * without a link-layer header here; confirm whether
+			 * it should be dropped instead.
+			 */
+			pr_debug("b_x_ml: no n\n");
+		}
+
+		rv = bond_dev_queue_xmit(bond, skb, mld->slave->dev);
+		break;
+
+	case htons(ETH_P_ARP):
+		pr_debug("b_x_ml: UNEXPECTED ARP\n");
+		break;
+
+	default:
+		rv = bond_dev_queue_xmit(bond, skb, bond->first_slave->dev);
+		break;
+	}
+
+out:
+	read_unlock(&bond->lock);
+	if (rv) {
+		pr_debug("xmit_ml rv %d\n", rv);
+		dev_kfree_skb(skb);
+	}
+
+	return NETDEV_TX_OK;
+}
+
+/*
+ * One-letter tag for an ml_route state, as printed by bond_ml_show_proc():
+ * "C" complete, "I" incomplete, "E" empty, "?" for anything else.
+ */
+static char *mlr_state_nm(int s)
+{
+	if (s == MLRT_COMPLETE)
+		return "C";
+	if (s == MLRT_INCOMPLETE)
+		return "I";
+	if (s == MLRT_EMPTY)
+		return "E";
+
+	return "?";
+}
+
+/*
+ * Two-letter tag for an ml_ipaddr flag, as printed by bond_ml_show_proc():
+ * "UP" for MLDD_IF_UP, "DN" for MLDD_IF_DOWN, "??" for anything else.
+ */
+static char *mlr_ipaddr_flag_nm(int f)
+{
+	if (f == MLDD_IF_UP)
+		return "UP";
+	if (f == MLDD_IF_DOWN)
+		return "DN";
+
+	return "??";
+}
+
+/*
+ * Print one line per populated destination slot of mlr: slot number, slave
+ * device name, local address and remote address.
+ */
+void bond_ml_show_proc_mlr(struct seq_file *seq, struct ml_route *mlr)
+{
+	int slot;
+
+	for (slot = 0; slot < BOND_ML_NDEST; slot++) {
+		struct ml_dest *dst = mlr->ml_dest[slot];
+
+		if (!dst)
+			continue;
+
+		seq_printf(seq, "   D %02d s %s l %pI4 r %pI4\n",
+			   slot, dst->slave->dev->name,
+			   &dst->laddr, &dst->raddr);
+	}
+}
+
+/*
+ * Dump the bond's ML route table to seq.  Each route gets a summary line
+ * (bucket, state, destination count, flag, ML address); routes in state
+ * MLRT_COMPLETE are followed by their destinations.  bond->lock is held
+ * for read for the duration of the walk.
+ */
+void bond_ml_show_proc(struct seq_file *seq, struct bonding *bond)
+{
+	struct ml_route *rt;
+	int bucket;
+
+	read_lock(&bond->lock);
+
+	for (bucket = 0; bucket < BOND_ML_HASH_SZ; bucket++) {
+		for (rt = bond->ml_info.ml_rtable[bucket]; rt; rt = rt->next) {
+			seq_printf(seq, "%02d s %s ndest %d ml_i: f %s %pI4\n",
+				   bucket, mlr_state_nm(rt->state),
+				   rt->num_dest,
+				   mlr_ipaddr_flag_nm(rt->ml_ipaddr.flag),
+				   &rt->ml_ipaddr.addr.s_addr);
+
+			if (rt->state != MLRT_COMPLETE)
+				continue;
+
+			bond_ml_show_proc_mlr(seq, rt);
+		}
+	}
+
+	read_unlock(&bond->lock);
+}
+
+/* interval between two runs of bond_ml_monitor(), in jiffies */
+static const int ml_delta_in_ticks = HZ * 10;
+
+/*
+ * ML periodic monitor
+ *
+ * Walk the ML routing table.  For each entry, check its state.  Ensure
+ * that ARP entries for ML routing entries are kept up to date.
+ *
+ * Runs from bond->wq with bond->lock held for read and re-queues itself
+ * every ml_delta_in_ticks unless bond->kill_timers is set.
+ *
+ * Destination slots are allocated from a bitmap and freed individually,
+ * so ml_dest[] can have holes: an empty slot is skipped, it does not end
+ * the scan of the route.
+ */
+void bond_ml_monitor(struct work_struct *work)
+{
+	struct bonding *bond = container_of(work, struct bonding,
+					    ml_work.work);
+	struct ml_route *mlr;
+	struct ml_dest *mld;
+	struct neighbour *n;
+	int i, j;
+
+	read_lock(&bond->lock);
+
+	if (bond->kill_timers)
+		goto out;
+
+	for (i = 0; i < BOND_ML_HASH_SZ; i++) {
+		for (mlr = bond->ml_info.ml_rtable[i]; mlr; mlr = mlr->next) {
+			if (mlr->state == MLRT_EMPTY)
+				continue;
+
+			for (j = 0; j < BOND_ML_NDEST; j++) {
+				mld = mlr->ml_dest[j];
+				if (!mld)
+					continue;
+
+				/* debug aid: catch stale or corrupt entries */
+				if (mld->magic != BOND_MLD_MAGIC) {
+					printk("bmm: bad magic %x s %p n %p l %x r %x\n",
+					       mld->magic, mld->slave,
+					       mld->neigh, mld->laddr,
+					       mld->raddr);
+					continue;
+				}
+
+				n = __neigh_lookup(&arp_tbl, &mld->raddr,
+						   mld->slave->dev, 1);
+				if (!n) {
+					pr_debug("bmm: no n r %pI4 s %s\n",
+						 &mld->raddr,
+						 mld->slave->dev->name);
+					continue;
+				}
+
+				/* mark in use and prod resolution */
+				n->used = jiffies;
+				neigh_event_send(n, NULL);
+				neigh_release(n);
+			}
+		}
+	}
+
+	queue_delayed_work(bond->wq, &bond->ml_work, ml_delta_in_ticks);
+out:
+	read_unlock(&bond->lock);
+}
+
+/*
+ * Header operations for the ML master.  Only .rebuild and .parse are
+ * provided; .create and the header-cache callbacks are left unset (NULL).
+ * At packet transmit time, we'll use the selected slave's ops to fill in
+ * the hard_header.
+ */
+static const struct header_ops bond_ml_header_ops = {
+	.parse		= eth_header_parse,
+	.rebuild	= eth_rebuild_header,
+};
+
+
+/*
+ * XXX use neigh->arp_queue to queue packets while discovery takes place
+ * Requires neigh_ops for ML.
+ * .solicit == discovery ?
+ */
+
+//static struct neigh_table bond_ml_tbl = {
+//};
+
+
+/*
+ * Free every ML destination, in every route of the bond, that transmits
+ * through the given slave.
+ *
+ * called with bond->lock held for write
+ */
+void bond_ml_unbind_slave(struct bonding *bond, struct slave *slave)
+{
+	struct ml_route *rt;
+	int bucket, slot;
+
+	for (bucket = 0; bucket < BOND_ML_HASH_SZ; bucket++) {
+		for (rt = bond->ml_info.ml_rtable[bucket]; rt; rt = rt->next) {
+			for (slot = 0; slot < BOND_ML_NDEST; slot++) {
+				struct ml_dest *dst = rt->ml_dest[slot];
+
+				if (!dst || dst->slave != slave)
+					continue;
+
+				bond_mlr_dest_free(bond, rt, dst);
+			}
+		}
+	}
+}
+
+/*
+ * Put a bond into multi-link mode: clear its ML route table, mark the
+ * master device IFF_NOARP without multicast/broadcast, and install the ML
+ * header_ops.
+ *
+ * bond_ml_salt is shared by every bond, so it is seeded only on the first
+ * call.  Re-seeding it on each call would change the bucket that existing
+ * entries of other ML bonds hash to, making them unreachable.
+ *
+ * NOTE(review): assumes callers are serialized (presumably under RTNL) -
+ * confirm; the one-time seeding is not otherwise protected.
+ */
+void bond_ml_init(struct bonding *bond)
+{
+	static bool salt_seeded;
+	struct net_device *bond_dev = bond->dev;
+
+	memset(&bond->ml_info, 0, sizeof(bond->ml_info));
+
+	bond_dev->flags |= IFF_NOARP;
+	bond_dev->flags &= ~(IFF_MULTICAST | IFF_BROADCAST);
+	bond_dev->header_ops = &bond_ml_header_ops;
+
+	if (!salt_seeded) {
+		get_random_bytes(&bond_ml_salt, sizeof(bond_ml_salt));
+		salt_seeded = true;
+	}
+}
diff --git a/drivers/net/bonding/bond_ml.h b/drivers/net/bonding/bond_ml.h
new file mode 100644
index 0000000..d5c4f6e
--- /dev/null
+++ b/drivers/net/bonding/bond_ml.h
@@ -0,0 +1,94 @@
+/*
+ * Multi-link mode support for bonding: shared definitions
+ */
+#ifndef __BOND_ML_H__
+#define __BOND_ML_H__
+
+#define MLDD_IF_DOWN	0xc0	/* ml_ipaddr.flag: set when a route's last destination is freed */
+#define MLDD_IF_UP	0xc1	/* ml_ipaddr.flag: set when a destination is added to a route */
+
+struct ml_ipaddr {
+	u8 ip_version;		/* not referenced by bond_ml.c */
+	u8 flag;		/* MLDD_IF_UP or MLDD_IF_DOWN */
+	u16 tick;		/* not referenced by bond_ml.c */
+	struct in_addr addr;	/* the ML IP address; key of the route hash */
+};
+
+#define MLDD_BCAST_REPLY	0xf0	/* NOTE(review): MLDD_* op codes are unused in bond_ml.c; presumably ml_msg.op values - confirm */
+#define MLDD_UCAST_REPLY	0xf1
+#define MLDD_REQUEST		0xf2
+#define MLDD_LOOKUP		0xf3
+
+struct ml_msg {	/* NOTE(review): not referenced by bond_ml.c; presumably the ML daemon's protocol message - confirm */
+	u8 version;
+	u8 op;
+	u16 reserved1;
+	u32 num;
+	s32 request_index;
+	s32 reply_index;
+	struct ml_ipaddr ml_ipaddr;
+	u16 req_net;
+	u16 rep_net;
+};
+
+#define BOND_MLD_MAGIC	0xfeedfeed	/* ml_dest.magic value of a live entry */
+
+struct ml_dest {
+	u32 magic;		/* BOND_MLD_MAGIC while live; overwritten just before the entry is freed */
+	struct slave *slave;	/* slave used to transmit to this destination */
+	struct neighbour *neigh; /* ARP entry for raddr on the slave's device; released when the entry is freed */
+	__be32 laddr;		/* local IP address of the path */
+	__be32 raddr;		/* peer interface IP address of the path */
+};
+
+#define MLRT_COMPLETE	0xa0	/* has a destination; the only state route lookups match */
+#define MLRT_INCOMPLETE 0xa1	/* last destination was freed */
+#define MLRT_EMPTY	0xa2	/* freshly created, nothing filled in yet */
+
+/*
+ * The ML protocol is limited to 16 destinations per ML route.
+ */
+#define BOND_ML_NDEST 16
+
+/*
+ * An ML route contains one peer IP address, the "ML IP" address of the
+ * peer system.  Within that route are one or more destination entries
+ * that specify the various possible paths to reach the ML IP peer.  Each
+ * destination entry includes the local slave and the peer interface IP
+ * address at the destination.
+ *
+ * Destination slots are handed out from ml_dest_map and released one at a
+ * time, so ml_dest[] may contain NULL holes between populated slots.
+ */
+struct ml_route {
+	struct ml_route *next;		/* next route in the same hash bucket */
+	u16 state;			/* MLRT_* */
+//	u16 index;
+	struct ml_ipaddr ml_ipaddr;	/* peer's ML IP address and up/down flag */
+	int num_dest;			/* number of populated ml_dest[] slots */
+	unsigned long ml_dest_map;	/* bit n set: ml_dest[n] is in use */
+	struct ml_dest *ml_dest[BOND_ML_NDEST];
+//	unsigned long ml_inactive_map;
+//	struct ml_dest *ml_inactive[LOCAL_IF_MAX];
+};
+
+/*
+ * Hash by ML IP address
+ *
+ * NOTE(review): 31 is not a power of two, so a bucket index has to be
+ * reduced modulo the table size; a (size - 1) bit mask cannot reach the
+ * odd-numbered buckets.
+ */
+#define BOND_ML_HASH_SZ		31
+
+struct ml_bond_info {
+	struct ml_route *ml_rtable[BOND_ML_HASH_SZ];	/* bucket heads; chains linked through ml_route.next */
+};
+
+extern int bond_xmit_ml(struct sk_buff *skb, struct net_device *bond_dev);
+extern int bond_ml_changelink(struct bonding *bond, struct bond_ml_route *bmr); /* NOTE(review): no definition in bond_ml.c - confirm it exists elsewhere */
+extern void bond_ml_monitor(struct work_struct *work);
+extern void bond_ml_show_proc(struct seq_file *, struct bonding *);
+extern void bond_ml_init(struct bonding *);
+extern int bond_ml_addrt(struct bonding *, struct in_addr, struct in_addr,
+			 struct in_addr, struct slave *); /* args: laddr, raddr, mladdr, slave */
+extern int bond_ml_delrt(struct bonding *, struct in_addr, struct in_addr,
+			 struct in_addr, struct slave *); /* args: laddr, raddr, mladdr, slave */
+extern void bond_ml_unbind_slave(struct bonding *bond, struct slave *slave);
+extern void bond_ml_rt_flush(struct bonding *bond);
+
+
+#endif /* __BOND_ML_H__ */
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index db7bb06..13b9dd5 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -23,6 +23,7 @@
 #include <linux/in6.h>
 #include "bond_3ad.h"
 #include "bond_alb.h"
+#include "bond_ml.h"
 
 #define DRV_VERSION	"3.7.0"
 #define DRV_RELDATE	"June 2, 2010"
@@ -246,6 +247,7 @@ struct bonding {
 	u16      rr_tx_counter;
 	struct   ad_bond_info ad_info;
 	struct   alb_bond_info alb_info;
+	struct   ml_bond_info ml_info;
 	struct   bond_params params;
 	struct   list_head vlan_list;
 	struct   vlan_group *vlgrp;
@@ -255,6 +257,7 @@ struct bonding {
 	struct   delayed_work arp_work;
 	struct   delayed_work alb_work;
 	struct   delayed_work ad_work;
+	struct   delayed_work ml_work;
 	struct   delayed_work mcast_work;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct   in6_addr master_ipv6;
@@ -361,6 +364,16 @@ static inline void bond_unset_master_alb_flags(struct bonding *bond)
 	bond->dev->priv_flags &= ~IFF_MASTER_ALB;
 }
 
+/* Mark the bond's master device as a multi-link master. */
+static inline void bond_set_master_ml_flags(struct bonding *bond)
+{
+	struct net_device *master = bond->dev;
+
+	master->priv_flags |= IFF_MASTER_ML;
+}
+
+/* Clear the multi-link master mark from the bond's master device. */
+static inline void bond_unset_master_ml_flags(struct bonding *bond)
+{
+	struct net_device *master = bond->dev;
+
+	master->priv_flags &= ~IFF_MASTER_ML;
+}
+
 struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr);
 int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev);
 int bond_create(struct net *net, const char *name);
diff --git a/include/linux/if.h b/include/linux/if.h
index 1239599..826b06f 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -77,6 +77,7 @@
 #define IFF_BRIDGE_PORT	0x8000		/* device used as bridge port */
 #define IFF_OVS_DATAPATH	0x10000	/* device used as Open vSwitch
 					 * datapath port */
+#define IFF_MASTER_ML	0x20000		/* bonding master, multi-link */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/include/linux/if_bonding.h b/include/linux/if_bonding.h
index b03d832..15c8773 100644
--- a/include/linux/if_bonding.h
+++ b/include/linux/if_bonding.h
@@ -70,6 +70,7 @@
 #define BOND_MODE_8023AD        4
 #define BOND_MODE_TLB           5
 #define BOND_MODE_ALB		6 /* TLB + RLB (receive load balancing) */
+#define BOND_MODE_ML		7
 
 /* each slave's link has 4 states */
 #define BOND_LINK_UP    0           /* link is up and running */
@@ -114,12 +115,22 @@ struct ad_info {
 	__u8 partner_system[ETH_ALEN];
 };
 
+struct bond_ml_route {
+	__u16 lif_index;
+	struct in_addr laddr;
+	struct in_addr raddr;
+};
+
 enum {
 	BOND_GENL_ATTR_UNSPEC = 0,
 	BOND_GENL_ATTR_MASTER_INDEX,
 	BOND_GENL_ATTR_SLAVE_INDEX,
 	BOND_GENL_ATTR_MODE,
 	BOND_GENL_ATTR_SLAVE_LINK,
+	BOND_GENL_ATTR_ML_LADDR,
+	BOND_GENL_ATTR_ML_RADDR,
+	BOND_GENL_ATTR_ML_MLADDR,
+	BOND_GENL_ATTR_ML_INDEX,
 	__BOND_GENL_ATTR_MAX,
 };
 
@@ -129,6 +140,10 @@ enum {
 	BOND_GENL_CMD_UNSPEC = 0,
 	BOND_GENL_CMD_GET_MODE,
 	BOND_GENL_SLAVE_LINK,
+	BOND_GENL_ML_CMD_RT_ADD,
+	BOND_GENL_ML_CMD_RT_DEL,
+	BOND_GENL_ML_CMD_RT_FLUSH,
+	BOND_GENL_ML_CMD_DISCOVERY,
 	__BOND_GENL_MAX,
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d28b3a0..02b653b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2921,10 +2921,28 @@ static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
 /* On bonding slaves other than the currently active slave, suppress
  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
  * ARP on active-backup slaves with arp_validate enabled.
+ * Additionally, set skb->dev appropriately for the mode / action.
  */
 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
 {
 	struct net_device *dev = skb->dev;
+	struct iphdr *iph;
+
+	if (master->priv_flags & IFF_MASTER_ML) {
+		if (skb->protocol == htons(ETH_P_IP)) {
+			iph = ip_hdr(skb);
+			if (!iph)
+				goto out;
+
+			/* For ML, assign to master only if traffic is for
+			 * master, as slaves keep their assigned IP addresses
+			 */
+			if (!ip_route_input(skb, iph->daddr, iph->saddr, 0,
+					    master))
+				skb->dev = master;
+		}
+		return 0;
+	}
 
 	if (master->priv_flags & IFF_MASTER_ARPMON)
 		dev->last_rx = jiffies;
@@ -2941,19 +2959,22 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
 	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
 		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
 		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return 0;
+			goto out;
 
 		if (master->priv_flags & IFF_MASTER_ALB) {
 			if (skb->pkt_type != PACKET_BROADCAST &&
 			    skb->pkt_type != PACKET_MULTICAST)
-				return 0;
+				goto out;
 		}
 		if (master->priv_flags & IFF_MASTER_8023AD &&
 		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-			return 0;
+			goto out;
 
 		return 1;
 	}
+
+out:
+	skb->dev = master;
 	return 0;
 }
 EXPORT_SYMBOL(__skb_bond_should_drop);
@@ -2981,6 +3002,10 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
 
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb->mac_len = skb->network_header - skb->mac_header;
+
 	/*
 	 * bonding note: skbs received on inactive slaves should only
 	 * be delivered to pkt handlers that are exact matches.  Also
@@ -2997,14 +3022,10 @@ static int __netif_receive_skb(struct sk_buff *skb)
 		if (skb_bond_should_drop(skb, master)) {
 			skb->deliver_no_wcard = 1;
 			null_or_orig = orig_dev; /* deliver only exact match */
-		} else
-			skb->dev = master;
+		}
 	}
 
 	__this_cpu_inc(softnet_data.processed);
-	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
-	skb->mac_len = skb->network_header - skb->mac_header;
 
 	pt_prev = NULL;
 
-- 
1.6.0.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html