lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 20 Apr 2018 08:21:00 -0700
From:   "Samudrala, Sridhar" <sridhar.samudrala@...el.com>
To:     "Michael S. Tsirkin" <mst@...hat.com>
Cc:     stephen@...workplumber.org, davem@...emloft.net,
        netdev@...r.kernel.org, virtualization@...ts.linux-foundation.org,
        virtio-dev@...ts.oasis-open.org, jesse.brandeburg@...el.com,
        alexander.h.duyck@...el.com, kubakici@...pl, jasowang@...hat.com,
        loseweigh@...il.com, jiri@...nulli.us
Subject: Re: [PATCH v7 net-next 2/4] net: Introduce generic failover module


On 4/19/2018 7:44 PM, Michael S. Tsirkin wrote:
> On Thu, Apr 19, 2018 at 06:42:02PM -0700, Sridhar Samudrala wrote:
>> This provides a generic interface for paravirtual drivers to listen
>> for netdev register/unregister/link change events from pci ethernet
>> devices with the same MAC and takeover their datapath. The notifier and
>> event handling code is based on the existing netvsc implementation.
>>
>> It exposes 2 sets of interfaces to the paravirtual drivers.
>> 1. existing netvsc driver that uses 2 netdev model. In this model, no
>> master netdev is created. The paravirtual driver registers each instance
>> of netvsc as a 'failover' instance  along with a set of ops to manage the
>> slave events.
>>       failover_register()
>>       failover_unregister()
>> 2. new virtio_net based solution that uses 3 netdev model. In this model,
>> the failover module provides interfaces to create/destroy additional master
>> netdev and all the slave events are managed internally.
>>        failover_create()
>>        failover_destroy()
>> These functions call failover_register()/failover_unregister() with the
>> master netdev created by the failover module.
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@...el.com>
> I like this patch. Yes something to improve (see below)
>
>> ---
>>   include/linux/netdevice.h |  16 +
>>   include/net/failover.h    |  96 ++++++
>>   net/Kconfig               |  18 +
>>   net/core/Makefile         |   1 +
>>   net/core/failover.c       | 844 ++++++++++++++++++++++++++++++++++++++++++++++
>>   5 files changed, 975 insertions(+)
>>   create mode 100644 include/net/failover.h
>>   create mode 100644 net/core/failover.c
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index cf44503ea81a..ed535b6724e1 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -1401,6 +1401,8 @@ struct net_device_ops {
>>    *	entity (i.e. the master device for bridged veth)
>>    * @IFF_MACSEC: device is a MACsec device
>>    * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
>> + * @IFF_FAILOVER: device is a failover master device
>> + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
>>    */
>>   enum netdev_priv_flags {
>>   	IFF_802_1Q_VLAN			= 1<<0,
>> @@ -1430,6 +1432,8 @@ enum netdev_priv_flags {
>>   	IFF_PHONY_HEADROOM		= 1<<24,
>>   	IFF_MACSEC			= 1<<25,
>>   	IFF_NO_RX_HANDLER		= 1<<26,
>> +	IFF_FAILOVER			= 1<<27,
>> +	IFF_FAILOVER_SLAVE		= 1<<28,
>>   };
>>   
>>   #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
>> @@ -1458,6 +1462,8 @@ enum netdev_priv_flags {
>>   #define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
>>   #define IFF_MACSEC			IFF_MACSEC
>>   #define IFF_NO_RX_HANDLER		IFF_NO_RX_HANDLER
>> +#define IFF_FAILOVER			IFF_FAILOVER
>> +#define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
>>   
>>   /**
>>    *	struct net_device - The DEVICE structure.
>> @@ -4308,6 +4314,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
>>   	return dev->priv_flags & IFF_RXFH_CONFIGURED;
>>   }
>>   
>> +static inline bool netif_is_failover(const struct net_device *dev)
>> +{
>> +	return dev->priv_flags & IFF_FAILOVER;
>> +}
>> +
>> +static inline bool netif_is_failover_slave(const struct net_device *dev)
>> +{
>> +	return dev->priv_flags & IFF_FAILOVER_SLAVE;
>> +}
>> +
>>   /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
>>   static inline void netif_keep_dst(struct net_device *dev)
>>   {
>> diff --git a/include/net/failover.h b/include/net/failover.h
>> new file mode 100644
>> index 000000000000..0b8601043d90
>> --- /dev/null
>> +++ b/include/net/failover.h
>> @@ -0,0 +1,96 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/* Copyright (c) 2018, Intel Corporation. */
>> +
>> +#ifndef _NET_FAILOVER_H
>> +#define _NET_FAILOVER_H
>> +
>> +#include <linux/netdevice.h>
>> +
>> +struct failover_ops {
>> +	int (*slave_pre_register)(struct net_device *slave_dev,
>> +				  struct net_device *failover_dev);
>> +	int (*slave_join)(struct net_device *slave_dev,
>> +			  struct net_device *failover_dev);
>> +	int (*slave_pre_unregister)(struct net_device *slave_dev,
>> +				    struct net_device *failover_dev);
>> +	int (*slave_release)(struct net_device *slave_dev,
>> +			     struct net_device *failover_dev);
>> +	int (*slave_link_change)(struct net_device *slave_dev,
>> +				 struct net_device *failover_dev);
>> +	rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> +};
>> +
>> +struct failover {
>> +	struct list_head list;
>> +	struct net_device __rcu *failover_dev;
>> +	struct failover_ops __rcu *ops;
>> +};
>> +
>> +/* failover state */
>> +struct failover_info {
>> +	/* primary netdev with same MAC */
>> +	struct net_device __rcu *primary_dev;
>> +
>> +	/* standby netdev */
>> +	struct net_device __rcu *standby_dev;
>> +
>> +	/* primary netdev stats */
>> +	struct rtnl_link_stats64 primary_stats;
>> +
>> +	/* standby netdev stats */
>> +	struct rtnl_link_stats64 standby_stats;
>> +
>> +	/* aggregated stats */
>> +	struct rtnl_link_stats64 failover_stats;
>> +
>> +	/* spinlock while updating stats */
>> +	spinlock_t stats_lock;
>> +};
>> +
>> +#if IS_ENABLED(CONFIG_NET_FAILOVER)
>> +
>> +int failover_create(struct net_device *standby_dev,
>> +		    struct failover **pfailover);
>> +void failover_destroy(struct failover *failover);
>> +
>> +int failover_register(struct net_device *standby_dev, struct failover_ops *ops,
>> +		      struct failover **pfailover);
>> +void failover_unregister(struct failover *failover);
>> +
>> +int failover_slave_unregister(struct net_device *slave_dev);
>> +
>> +#else
>> +
>> +static inline
>> +int failover_create(struct net_device *standby_dev,
>> +		    struct failover **pfailover);
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline
>> +void failover_destroy(struct failover *failover)
>> +{
>> +}
>> +
>> +static inline
>> +int failover_register(struct net_device *standby_dev, struct failover_ops *ops,
>> +		      struct pfailover **pfailover);
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline
>> +void failover_unregister(struct failover *failover)
>> +{
>> +}
>> +
>> +static inline
>> +int failover_slave_unregister(struct net_device *slave_dev)
>> +{
>> +	return 0;
>> +}
>> +
>> +#endif
>> +
>> +#endif /* _NET_FAILOVER_H */
>> diff --git a/net/Kconfig b/net/Kconfig
>> index 0428f12c25c2..388b99dfee10 100644
>> --- a/net/Kconfig
>> +++ b/net/Kconfig
>> @@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
>>   	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
>>   	  devlink is a loadable module and the driver using it is built-in.
>>   
>> +config NET_FAILOVER
>> +	tristate "Failover interface"
>> +	help
>> +	  This provides a generic interface for paravirtual drivers to listen
>> +	  for netdev register/unregister/link change events from pci ethernet
>> +	  devices with the same MAC and takeover their datapath. This also
>> +	  enables live migration of a VM with direct attached VF by failing
>> +	  over to the paravirtual datapath when the VF is unplugged.
>> +
>> +config MAY_USE_FAILOVER
>> +	tristate
>> +	default m if NET_FAILOVER=m
>> +	default y if NET_FAILOVER=y || NET_FAILOVER=n
>> +	help
>> +	  Drivers using the failover infrastructure should have a dependency
>> +	  on MAY_USE_FAILOVER to ensure they do not cause link errors when
>> +	  failover is a loadable module and the driver using it is built-in.
>> +
>>   endif   # if NET
>>   
>>   # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>> diff --git a/net/core/Makefile b/net/core/Makefile
>> index 6dbbba8c57ae..cef17518bb7d 100644
>> --- a/net/core/Makefile
>> +++ b/net/core/Makefile
>> @@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
>>   obj-$(CONFIG_HWBM) += hwbm.o
>>   obj-$(CONFIG_NET_DEVLINK) += devlink.o
>>   obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>> +obj-$(CONFIG_NET_FAILOVER) += failover.o
>> diff --git a/net/core/failover.c b/net/core/failover.c
>> new file mode 100644
>> index 000000000000..7bee762cb737
>> --- /dev/null
>> +++ b/net/core/failover.c
>> @@ -0,0 +1,844 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2018, Intel Corporation. */
>
> I think you should copy the header from bond_main.c upon which
> some of the code seems to be based.

Yes. some of the code is based on bonding/team and netvsc. i added a reference
to netvsc in the comment, will also include bonding/team driver.

>
>> +
>> +/* A common module to handle registrations and notifications for paravirtual
>> + * drivers to enable accelerated datapath and support VF live migration.
>> + *
>> + * The notifier and event handling code is based on netvsc driver.
>> + */
>> +
>> +#include <linux/netdevice.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/ethtool.h>
>> +#include <linux/module.h>
>> +#include <linux/slab.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>> +#include <linux/rtnetlink.h>
>> +#include <linux/if_vlan.h>
>> +#include <linux/pci.h>
>> +#include <net/sch_generic.h>
>> +#include <uapi/linux/if_arp.h>
>> +#include <net/failover.h>
>> +
>> +static LIST_HEAD(failover_list);
>> +static DEFINE_SPINLOCK(failover_lock);
>> +
>> +static int failover_slave_pre_register(struct net_device *slave_dev,
>> +				       struct net_device *failover_dev,
>> +				       struct failover_ops *failover_ops)
>> +{
>> +	struct failover_info *finfo;
>> +	bool standby;
>> +
>> +	if (failover_ops) {
>> +		if (!failover_ops->slave_pre_register)
>> +			return -EINVAL;
>> +
>> +		return failover_ops->slave_pre_register(slave_dev,
>> +							failover_dev);
>> +	}
>> +
>> +	finfo = netdev_priv(failover_dev);
>> +	standby = (slave_dev->dev.parent == failover_dev->dev.parent);
>> +	if (standby ? rtnl_dereference(finfo->standby_dev) :
>> +			rtnl_dereference(finfo->primary_dev)) {
>> +		netdev_err(failover_dev, "%s attempting to register as slave dev when %s already present\n",
>> +			   slave_dev->name, standby ? "standby" : "primary");
>> +		return -EEXIST;
>> +	}
>> +
>> +	/* Avoid non pci devices as primary netdev */
> Why? Pls change this comment so it explains the motivation
> rather than just repeat what the code does.

OK.

>
>> +	if (!standby && (!slave_dev->dev.parent ||
>> +			 !dev_is_pci(slave_dev->dev.parent)))
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>> +static int failover_slave_join(struct net_device *slave_dev,
>> +			       struct net_device *failover_dev,
>> +			       struct failover_ops *failover_ops)
>> +{
>> +	struct failover_info *finfo;
>> +	int err, orig_mtu;
>> +	bool standby;
>> +
>> +	if (failover_ops) {
>> +		if (!failover_ops->slave_join)
>> +			return -EINVAL;
>> +
>> +		return failover_ops->slave_join(slave_dev, failover_dev);
>> +	}
>> +
>> +	if (netif_running(failover_dev)) {
>> +		err = dev_open(slave_dev);
>> +		if (err && (err != -EBUSY)) {
>> +			netdev_err(failover_dev, "Opening slave %s failed err:%d\n",
>> +				   slave_dev->name, err);
>> +			goto err_dev_open;
>> +		}
>> +	}
>> +
>> +	/* Align MTU of slave with failover dev */
>> +	orig_mtu = slave_dev->mtu;
> I suspect this was copied from bond. this variable is never
> used and I'm even surprised gcc did not warn about this.

Thanks for catching, I broke this when i moved the dev_open() and dev_set_mtu()
calls from register to join. I need to reset slave_dev mtu to orig_mtu on failure.


>
>
>> +	err = dev_set_mtu(slave_dev, failover_dev->mtu);
> How do we know slave supports this MTU? same applies to
> failover_change_mtu.

The err check below should catch it and we will reset the mtu back and
fail the join/register.

>
>
>
>
>> +	if (err) {
>> +		netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n",
>> +			   slave_dev->name, failover_dev->mtu);
>> +		goto err_set_mtu;
>> +	}
>> +
>> +	finfo = netdev_priv(failover_dev);
>> +	standby = (slave_dev->dev.parent == failover_dev->dev.parent);
>> +
>> +	dev_hold(slave_dev);
>> +
>> +	if (standby) {
>> +		rcu_assign_pointer(finfo->standby_dev, slave_dev);
>> +		dev_get_stats(finfo->standby_dev, &finfo->standby_stats);
>> +	} else {
>> +		rcu_assign_pointer(finfo->primary_dev, slave_dev);
>> +		dev_get_stats(finfo->primary_dev, &finfo->primary_stats);
>> +		failover_dev->min_mtu = slave_dev->min_mtu;
>> +		failover_dev->max_mtu = slave_dev->max_mtu;
>> +	}
>> +
>> +	netdev_info(failover_dev, "failover slave:%s joined\n",
>> +		    slave_dev->name);
>> +
>> +	return 0;
>> +
>> +err_set_mtu:
>> +	dev_close(slave_dev);
>> +err_dev_open:
>> +	return err;
>> +}
>> +
>> +/* Called when slave dev is injecting data into network stack.
>> + * Change the associated network device from lower dev to virtio.
>> + * note: already called with rcu_read_lock
>> + */
>> +static rx_handler_result_t failover_handle_frame(struct sk_buff **pskb)
>> +{
>> +	struct sk_buff *skb = *pskb;
>> +	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
>> +
>> +	skb->dev = ndev;
>> +
>> +	return RX_HANDLER_ANOTHER;
>> +}
>> +
>> +static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
>> +{
>> +	struct net_device *failover_dev;
>> +	struct failover *failover;
>> +
>> +	spin_lock(&failover_lock);
>> +	list_for_each_entry(failover, &failover_list, list) {
>> +		failover_dev = rtnl_dereference(failover->failover_dev);
>> +		if (ether_addr_equal(failover_dev->perm_addr, mac)) {
>> +			*ops = rtnl_dereference(failover->ops);
>> +			spin_unlock(&failover_lock);
>> +			return failover_dev;
>> +		}
>> +	}
>> +	spin_unlock(&failover_lock);
>> +	return NULL;
>> +}
>> +
>> +static int failover_slave_register(struct net_device *slave_dev)
>> +{
>> +	struct failover_ops *failover_ops;
>> +	struct net_device *failover_dev;
>> +	int ret;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &failover_ops);
>> +	if (!failover_dev)
>> +		goto done;
>> +
>> +	ret = failover_slave_pre_register(slave_dev, failover_dev,
>> +					  failover_ops);
>> +	if (ret)
>> +		goto done;
>> +
>> +	ret = netdev_rx_handler_register(slave_dev, failover_ops ?
>> +					 failover_ops->handle_frame :
>> +					 failover_handle_frame, failover_dev);
>> +	if (ret) {
>> +		netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
>> +			   ret);
>> +		goto done;
>> +	}
>> +
>> +	ret = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
>> +	if (ret) {
>> +		netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
>> +			   failover_dev->name, ret);
>> +		goto upper_link_failed;
>> +	}
>> +
>> +	slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
>> +
>> +	ret = failover_slave_join(slave_dev, failover_dev, failover_ops);
>> +	if (ret)
>> +		goto err_join;
>> +
>> +	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);
>> +
>> +	netdev_info(failover_dev, "failover slave:%s registered\n",
>> +		    slave_dev->name);
>> +
>> +	goto done;
>> +
>> +err_join:
>> +	netdev_upper_dev_unlink(slave_dev, failover_dev);
>> +	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
>> +upper_link_failed:
>> +	netdev_rx_handler_unregister(slave_dev);
>> +done:
>> +	return NOTIFY_DONE;
>> +}
>> +
>> +static int failover_slave_pre_unregister(struct net_device *slave_dev,
>> +					 struct net_device *failover_dev,
>> +					 struct failover_ops *failover_ops)
>> +{
>> +	struct net_device *standby_dev, *primary_dev;
>> +	struct failover_info *finfo;
>> +
>> +	if (failover_ops) {
>> +		if (!failover_ops->slave_pre_unregister)
>> +			return -EINVAL;
>> +
>> +		return failover_ops->slave_pre_unregister(slave_dev,
>> +							  failover_dev);
>> +	}
>> +
>> +	finfo = netdev_priv(failover_dev);
>> +	primary_dev = rtnl_dereference(finfo->primary_dev);
>> +	standby_dev = rtnl_dereference(finfo->standby_dev);
>> +
>> +	if (slave_dev != primary_dev && slave_dev != standby_dev)
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>> +static int failover_slave_release(struct net_device *slave_dev,
>> +				  struct net_device *failover_dev,
>> +				  struct failover_ops *failover_ops)
>> +{
>> +	struct net_device *standby_dev, *primary_dev;
>> +	struct failover_info *finfo;
>> +
>> +	if (failover_ops) {
>> +		if (!failover_ops->slave_release)
>> +			return -EINVAL;
>> +
>> +		return failover_ops->slave_release(slave_dev, failover_dev);
>> +	}
>> +
>> +	finfo = netdev_priv(failover_dev);
>> +	primary_dev = rtnl_dereference(finfo->primary_dev);
>> +	standby_dev = rtnl_dereference(finfo->standby_dev);
>> +
>> +	if (slave_dev == standby_dev) {
>> +		RCU_INIT_POINTER(finfo->standby_dev, NULL);
>> +	} else {
>> +		RCU_INIT_POINTER(finfo->primary_dev, NULL);
>> +		if (standby_dev) {
>> +			failover_dev->min_mtu = standby_dev->min_mtu;
>> +			failover_dev->max_mtu = standby_dev->max_mtu;
>> +		}
>> +	}
>> +
>> +	dev_put(slave_dev);
>> +
>> +	netdev_info(failover_dev, "failover slave:%s released\n",
>> +		    slave_dev->name);
>> +
>> +	return 0;
>> +}
>> +
>> +int failover_slave_unregister(struct net_device *slave_dev)
>> +{
>> +	struct failover_ops *failover_ops;
>> +	struct net_device *failover_dev;
>> +	int ret;
>> +
>> +	if (!netif_is_failover_slave(slave_dev))
>> +		goto done;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &failover_ops);
>> +	if (!failover_dev)
>> +		goto done;
>> +
>> +	ret = failover_slave_pre_unregister(slave_dev, failover_dev,
>> +					    failover_ops);
>> +	if (ret)
>> +		goto done;
>> +
>> +	netdev_rx_handler_unregister(slave_dev);
>> +	netdev_upper_dev_unlink(slave_dev, failover_dev);
>> +	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
>> +
>> +	failover_slave_release(slave_dev, failover_dev, failover_ops);
>
> Don't you need to get stats from it? This device is going away ...

Yes. we need to update the failover_stats before the slave goes away.

>
>> +
>> +	netdev_info(failover_dev, "failover slave:%s unregistered\n",
>> +		    slave_dev->name);
>> +
>> +done:
>> +	return NOTIFY_DONE;
>> +}
>> +EXPORT_SYMBOL_GPL(failover_slave_unregister);
>> +
>> +static bool failover_xmit_ready(struct net_device *dev)
>> +{
>> +	return netif_running(dev) && netif_carrier_ok(dev);
>> +}
>> +
>> +static int failover_slave_link_change(struct net_device *slave_dev)
>> +{
>> +	struct net_device *failover_dev, *primary_dev, *standby_dev;
>> +	struct failover_ops *failover_ops;
>> +	struct failover_info *finfo;
>> +
>> +	if (!netif_is_failover_slave(slave_dev))
>> +		goto done;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &failover_ops);
>> +	if (!failover_dev)
>> +		goto done;
>> +
>> +	if (failover_ops) {
>> +		if (!failover_ops->slave_link_change)
>> +			goto done;
>> +
>> +		return failover_ops->slave_link_change(slave_dev, failover_dev);
>> +	}
>> +
>> +	if (!netif_running(failover_dev))
>> +		return 0;
>> +
>> +	finfo = netdev_priv(failover_dev);
>> +
>> +	primary_dev = rtnl_dereference(finfo->primary_dev);
>> +	standby_dev = rtnl_dereference(finfo->standby_dev);
>> +
>> +	if (slave_dev != primary_dev && slave_dev != standby_dev)
>> +		goto done;
>> +
>> +	if ((primary_dev && failover_xmit_ready(primary_dev)) ||
>> +	    (standby_dev && failover_xmit_ready(standby_dev))) {
>> +		netif_carrier_on(failover_dev);
>> +		netif_tx_wake_all_queues(failover_dev);
>> +	} else {
>> +		netif_carrier_off(failover_dev);
>> +		netif_tx_stop_all_queues(failover_dev);
> And I think it's a good idea to get stats from device here too.

Not sure why we need to get stats from lower devs here?


>
>
>> +	}
>> +
>> +done:
>> +	return NOTIFY_DONE;
>> +}
>> +
>> +static bool failover_validate_event_dev(struct net_device *dev)
>> +{
>> +	/* Skip parent events */
>> +	if (netif_is_failover(dev))
>> +		return false;
>> +
>> +	/* Avoid non-Ethernet type devices */
> ... for now. It would be possible easy to make this generic -
> just copy things like type and addr_len from slave.

ok. failover_create can copy these values from standby_dev and
we can validate that they match when primary_dev registers.

>
>> +	if (dev->type != ARPHRD_ETHER)
>> +		return false;
>> +
>> +	return true;
>> +}
>> +
>> +static int
>> +failover_event(struct notifier_block *this, unsigned long event, void *ptr)
>> +{
>> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>> +
>> +	if (!failover_validate_event_dev(event_dev))
>> +		return NOTIFY_DONE;
>> +
>> +	switch (event) {
>> +	case NETDEV_REGISTER:
>> +		return failover_slave_register(event_dev);
>> +	case NETDEV_UNREGISTER:
>> +		return failover_slave_unregister(event_dev);
>> +	case NETDEV_UP:
>> +	case NETDEV_DOWN:
>> +	case NETDEV_CHANGE:
>> +		return failover_slave_link_change(event_dev);
>> +	default:
>> +		return NOTIFY_DONE;
>> +	}
>> +}
>> +
>> +static struct notifier_block failover_notifier = {
>> +	.notifier_call = failover_event,
>> +};
>> +
>> +static int failover_open(struct net_device *dev)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	struct net_device *primary_dev, *standby_dev;
>> +	int err;
>> +
>> +	netif_carrier_off(dev);
>> +	netif_tx_wake_all_queues(dev);
>> +
>> +	primary_dev = rtnl_dereference(finfo->primary_dev);
>> +	if (primary_dev) {
>> +		err = dev_open(primary_dev);
>> +		if (err)
>> +			goto err_primary_open;
>> +	}
>> +
>> +	standby_dev = rtnl_dereference(finfo->standby_dev);
>> +	if (standby_dev) {
>> +		err = dev_open(standby_dev);
>> +		if (err)
>> +			goto err_standby_open;
>> +	}
>> +
>> +	return 0;
>> +
>> +err_standby_open:
>> +	dev_close(primary_dev);
>> +err_primary_open:
>> +	netif_tx_disable(dev);
>> +	return err;
>> +}
>> +
>> +static int failover_close(struct net_device *dev)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	struct net_device *slave_dev;
>> +
>> +	netif_tx_disable(dev);
>> +
>> +	slave_dev = rtnl_dereference(finfo->primary_dev);
>> +	if (slave_dev)
>> +		dev_close(slave_dev);
>> +
>> +	slave_dev = rtnl_dereference(finfo->standby_dev);
>> +	if (slave_dev)
>> +		dev_close(slave_dev);
>> +
>> +	return 0;
>> +}
>> +
>> +static netdev_tx_t failover_drop_xmit(struct sk_buff *skb,
>> +				      struct net_device *dev)
>> +{
>> +	atomic_long_inc(&dev->tx_dropped);
>> +	dev_kfree_skb_any(skb);
>> +	return NETDEV_TX_OK;
>> +}
>> +
>> +static netdev_tx_t failover_start_xmit(struct sk_buff *skb,
>> +				       struct net_device *dev)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	struct net_device *xmit_dev;
>> +
>> +	/* Try xmit via primary netdev followed by standby netdev */
>> +	xmit_dev = rcu_dereference_bh(finfo->primary_dev);
>> +	if (!xmit_dev || !failover_xmit_ready(xmit_dev)) {
>> +		xmit_dev = rcu_dereference_bh(finfo->standby_dev);
>> +		if (!xmit_dev || !failover_xmit_ready(xmit_dev))
>> +			return failover_drop_xmit(skb, dev);
>> +	}
>> +
>> +	skb->dev = xmit_dev;
>> +	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
>> +
>> +	return dev_queue_xmit(skb);
>> +}
> Is this going through qdisc twice? Won't this hurt performance
> measureably?

The failover dev has no queue (IFF_NO_QUEUE) , so doesn't go through qdisc twice.

>
>> +
>> +static u16 failover_select_queue(struct net_device *dev, struct sk_buff *skb,
>> +				 void *accel_priv,
>> +				 select_queue_fallback_t fallback)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	struct net_device *primary_dev;
>> +	u16 txq;
>> +
>> +	rcu_read_lock();
>> +	primary_dev = rcu_dereference(finfo->primary_dev);
>> +	if (primary_dev) {
>> +		const struct net_device_ops *ops = primary_dev->netdev_ops;
>> +
>> +		if (ops->ndo_select_queue)
>> +			txq = ops->ndo_select_queue(primary_dev, skb,
>> +						    accel_priv, fallback);
>> +		else
>> +			txq = fallback(primary_dev, skb);
>> +
>> +		qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
>> +
>> +		return txq;
>> +	}
>> +
>> +	txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
>> +
>> +	/* Save the original txq to restore before passing to the driver */
>> +	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
>> +
>> +	if (unlikely(txq >= dev->real_num_tx_queues)) {
>> +		do {
>> +			txq -= dev->real_num_tx_queues;
>> +		} while (txq >= dev->real_num_tx_queues);
>> +	}
>> +
>> +	return txq;
>> +}
>> +
>> +/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
>> + * that some drivers can provide 32finfot values only.
>> + */
>> +static void failover_fold_stats(struct rtnl_link_stats64 *_res,
>> +				const struct rtnl_link_stats64 *_new,
>> +				const struct rtnl_link_stats64 *_old)
>> +{
>> +	const u64 *new = (const u64 *)_new;
>> +	const u64 *old = (const u64 *)_old;
>> +	u64 *res = (u64 *)_res;
>> +	int i;
>> +
>> +	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
>> +		u64 nv = new[i];
>> +		u64 ov = old[i];
>> +		s64 delta = nv - ov;
>> +
>> +		/* detects if this particular field is 32bit only */
>> +		if (((nv | ov) >> 32) == 0)
>> +			delta = (s64)(s32)((u32)nv - (u32)ov);
>> +
>> +		/* filter anomalies, some drivers reset their stats
>> +		 * at down/up events.
>> +		 */
>> +		if (delta > 0)
>> +			res[i] += delta;
>> +	}
>> +}
>> +
>> +static void failover_get_stats(struct net_device *dev,
>> +			       struct rtnl_link_stats64 *stats)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	const struct rtnl_link_stats64 *new;
>> +	struct rtnl_link_stats64 temp;
>> +	struct net_device *slave_dev;
>> +
>> +	spin_lock(&finfo->stats_lock);
>> +	memcpy(stats, &finfo->failover_stats, sizeof(*stats));
>> +
>> +	rcu_read_lock();
>> +
>> +	slave_dev = rcu_dereference(finfo->primary_dev);
>> +	if (slave_dev) {
>> +		new = dev_get_stats(slave_dev, &temp);
>> +		failover_fold_stats(stats, new, &finfo->primary_stats);
>> +		memcpy(&finfo->primary_stats, new, sizeof(*new));
>> +	}
>> +
>> +	slave_dev = rcu_dereference(finfo->standby_dev);
>> +	if (slave_dev) {
>> +		new = dev_get_stats(slave_dev, &temp);
>> +		failover_fold_stats(stats, new, &finfo->standby_stats);
>> +		memcpy(&finfo->standby_stats, new, sizeof(*new));
>> +	}
>> +
>> +	rcu_read_unlock();
>> +
>> +	memcpy(&finfo->failover_stats, stats, sizeof(*stats));
>> +	spin_unlock(&finfo->stats_lock);
>> +}
>> +
>> +static int failover_change_mtu(struct net_device *dev, int new_mtu)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	struct net_device *primary_dev, *standby_dev;
>> +	int ret = 0;
>> +
>> +	primary_dev = rcu_dereference(finfo->primary_dev);
>> +	if (primary_dev) {
>> +		ret = dev_set_mtu(primary_dev, new_mtu);
>> +		if (ret)
>> +			return ret;
>> +	}
>> +
>> +	standby_dev = rcu_dereference(finfo->standby_dev);
>> +	if (standby_dev) {
>> +		ret = dev_set_mtu(standby_dev, new_mtu);
>> +		if (ret) {
>> +			dev_set_mtu(primary_dev, dev->mtu);
>> +			return ret;
>> +		}
>> +	}
>> +
>> +	dev->mtu = new_mtu;
>> +	return 0;
>> +}
>> +
>> +static void failover_set_rx_mode(struct net_device *dev)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	struct net_device *slave_dev;
>> +
>> +	rcu_read_lock();
>> +
>> +	slave_dev = rcu_dereference(finfo->primary_dev);
>> +	if (slave_dev) {
>> +		dev_uc_sync_multiple(slave_dev, dev);
>> +		dev_mc_sync_multiple(slave_dev, dev);
>> +	}
>> +
>> +	slave_dev = rcu_dereference(finfo->standby_dev);
>> +	if (slave_dev) {
>> +		dev_uc_sync_multiple(slave_dev, dev);
>> +		dev_mc_sync_multiple(slave_dev, dev);
>> +	}
>> +
>> +	rcu_read_unlock();
>> +}
>> +
>> +static const struct net_device_ops failover_dev_ops = {
>> +	.ndo_open		= failover_open,
>> +	.ndo_stop		= failover_close,
>> +	.ndo_start_xmit		= failover_start_xmit,
>> +	.ndo_select_queue	= failover_select_queue,
>> +	.ndo_get_stats64	= failover_get_stats,
>> +	.ndo_change_mtu		= failover_change_mtu,
>> +	.ndo_set_rx_mode	= failover_set_rx_mode,
>> +	.ndo_validate_addr	= eth_validate_addr,
>> +	.ndo_features_check	= passthru_features_check,
> xdp support?

I think it should be possible to add it be calling the lower dev ndo_xdp routines
with proper checks. can we add this later?

>
>> +};
>> +
>> +#define FAILOVER_NAME "failover"
>> +#define FAILOVER_VERSION "0.1"
>> +
>> +static void failover_ethtool_get_drvinfo(struct net_device *dev,
>> +					 struct ethtool_drvinfo *drvinfo)
>> +{
>> +	strlcpy(drvinfo->driver, FAILOVER_NAME, sizeof(drvinfo->driver));
>> +	strlcpy(drvinfo->version, FAILOVER_VERSION, sizeof(drvinfo->version));
>> +}
>> +
>> +int failover_ethtool_get_link_ksettings(struct net_device *dev,
>> +					struct ethtool_link_ksettings *cmd)
>> +{
>> +	struct failover_info *finfo = netdev_priv(dev);
>> +	struct net_device *slave_dev;
>> +
>> +	slave_dev = rtnl_dereference(finfo->primary_dev);
>> +	if (!slave_dev || !failover_xmit_ready(slave_dev)) {
>> +		slave_dev = rtnl_dereference(finfo->standby_dev);
>> +		if (!slave_dev || !failover_xmit_ready(slave_dev)) {
>> +			cmd->base.duplex = DUPLEX_UNKNOWN;
>> +			cmd->base.port = PORT_OTHER;
>> +			cmd->base.speed = SPEED_UNKNOWN;
>> +
>> +			return 0;
>> +		}
>> +	}
>> +
>> +	return __ethtool_get_link_ksettings(slave_dev, cmd);
>> +}
>> +EXPORT_SYMBOL_GPL(failover_ethtool_get_link_ksettings);
>> +
>> +static const struct ethtool_ops failover_ethtool_ops = {
>> +	.get_drvinfo            = failover_ethtool_get_drvinfo,
>> +	.get_link               = ethtool_op_get_link,
>> +	.get_link_ksettings     = failover_ethtool_get_link_ksettings,
>> +};
>> +
>> +static void failover_register_existing_slave(struct net_device *failover_dev)
>> +{
>> +	struct net *net = dev_net(failover_dev);
>> +	struct net_device *dev;
>> +
>> +	rtnl_lock();
>> +	for_each_netdev(net, dev) {
>> +		if (dev == failover_dev)
>> +			continue;
>> +		if (!failover_validate_event_dev(dev))
>> +			continue;
>> +		if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
>> +			failover_slave_register(dev);
>> +	}
>> +	rtnl_unlock();
>> +}
>> +
>> +int failover_register(struct net_device *dev, struct failover_ops *ops,
>> +		      struct failover **pfailover)
>> +{
>> +	struct failover *failover;
>> +
>> +	failover = kzalloc(sizeof(*failover), GFP_KERNEL);
>> +	if (!failover)
>> +		return -ENOMEM;
>> +
>> +	rcu_assign_pointer(failover->ops, ops);
>> +	dev_hold(dev);
>> +	dev->priv_flags |= IFF_FAILOVER;
>> +	rcu_assign_pointer(failover->failover_dev, dev);
>> +
>> +	spin_lock(&failover_lock);
>> +	list_add_tail(&failover->list, &failover_list);
>> +	spin_unlock(&failover_lock);
>> +
>> +	failover_register_existing_slave(dev);
>> +
>> +	*pfailover = failover;
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(failover_register);
>> +
>> +void failover_unregister(struct failover *failover)
>> +{
>> +	struct net_device *failover_dev;
>> +
>> +	failover_dev = rcu_dereference(failover->failover_dev);
>> +
>> +	failover_dev->priv_flags &= ~IFF_FAILOVER;
>> +	dev_put(failover_dev);
>> +
>> +	spin_lock(&failover_lock);
>> +	list_del(&failover->list);
>> +	spin_unlock(&failover_lock);
>> +
>> +	kfree(failover);
>> +}
>> +EXPORT_SYMBOL_GPL(failover_unregister);
>> +
>> +int failover_create(struct net_device *standby_dev, struct failover **pfailover)
>> +{
>> +	struct device *dev = standby_dev->dev.parent;
>> +	struct net_device *failover_dev;
>> +	int err;
>> +
>> +	/* Alloc at least 2 queues, for now we are going with 16 assuming
>> +	 * that most devices being bonded won't have too many queues.
>> +	 */
>> +	failover_dev = alloc_etherdev_mq(sizeof(struct failover_info), 16);
>> +	if (!failover_dev) {
>> +		dev_err(dev, "Unable to allocate failover_netdev!\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	dev_net_set(failover_dev, dev_net(standby_dev));
>> +	SET_NETDEV_DEV(failover_dev, dev);
>> +
>> +	failover_dev->netdev_ops = &failover_dev_ops;
>> +	failover_dev->ethtool_ops = &failover_ethtool_ops;
>> +
>> +	/* Initialize the device options */
>> +	failover_dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
>> +	failover_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
>> +				       IFF_TX_SKB_SHARING);
>> +
>> +	/* don't acquire failover netdev's netif_tx_lock when transmitting */
>> +	failover_dev->features |= NETIF_F_LLTX;
>> +
>> +	/* Don't allow failover devices to change network namespaces. */
>> +	failover_dev->features |= NETIF_F_NETNS_LOCAL;
>> +
>> +	failover_dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
>> +				    NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
>> +				    NETIF_F_HIGHDMA | NETIF_F_LRO;
> OK but then you must make sure your primary and standby both
> support these features.

I guess i need to add something like bond_compute_features() to handle this.

>
>> +
>> +	failover_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>> +	failover_dev->features |= failover_dev->hw_features;
>> +
>> +	memcpy(failover_dev->dev_addr, standby_dev->dev_addr,
>> +	       failover_dev->addr_len);
>> +
>> +	failover_dev->min_mtu = standby_dev->min_mtu;
>> +	failover_dev->max_mtu = standby_dev->max_mtu;
> OK MTU is copied, fine. But is this always enough?
>
> How about e.g. hard_header_len? min_header_len? needed_headroom?
> needed_tailroom? I'd worry that even if you cover existing ones more
> might be added with time.  A function copying config between devices
> probably belongs in some central place IMHO.

ok. will add a function that handles these fields too.

>
>
>> +
>> +	err = register_netdev(failover_dev);
>> +	if (err < 0) {
>> +		dev_err(dev, "Unable to register failover_dev!\n");
>> +		goto err_register_netdev;
>> +	}
>> +
>> +	netif_carrier_off(failover_dev);
>> +
>> +	err = failover_register(failover_dev, NULL, pfailover);
>> +	if (err < 0)
>> +		goto err_failover;
>> +
>> +	return 0;
>> +
>> +err_failover:
>> +	unregister_netdev(failover_dev);
>> +err_register_netdev:
>> +	free_netdev(failover_dev);
>> +
>> +	return err;
>> +}
>> +EXPORT_SYMBOL_GPL(failover_create);
>> +
>> +void failover_destroy(struct failover *failover)
>> +{
>> +	struct net_device *failover_dev;
>> +	struct net_device *slave_dev;
>> +	struct failover_info *finfo;
>> +
>> +	if (!failover)
>> +		return;
>> +
>> +	failover_dev = rcu_dereference(failover->failover_dev);
>> +	finfo = netdev_priv(failover_dev);
>> +
>> +	netif_device_detach(failover_dev);
>> +
>> +	rtnl_lock();
>> +
>> +	slave_dev = rtnl_dereference(finfo->primary_dev);
>> +	if (slave_dev)
>> +		failover_slave_unregister(slave_dev);
>> +
>> +	slave_dev = rtnl_dereference(finfo->standby_dev);
>> +	if (slave_dev)
>> +		failover_slave_unregister(slave_dev);
>> +
>> +	failover_unregister(failover);
>> +
>> +	unregister_netdevice(failover_dev);
>> +
>> +	rtnl_unlock();
>> +
>> +	free_netdev(failover_dev);
>> +}
>> +EXPORT_SYMBOL_GPL(failover_destroy);
>> +
>> +static __init int
>> +failover_init(void)
>> +{
>> +	register_netdevice_notifier(&failover_notifier);
>> +
>> +	return 0;
>> +}
>> +module_init(failover_init);
>> +
>> +static __exit
>> +void failover_exit(void)
>> +{
>> +	unregister_netdevice_notifier(&failover_notifier);
>> +}
>> +module_exit(failover_exit);
>> +
>> +MODULE_DESCRIPTION("Failover infrastructure/interface for Paravirtual drivers");
>> +MODULE_LICENSE("GPL v2");
>> -- 
>> 2.14.3

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ