Message-ID: <CADGSJ23tForX9HSvYpC-1wV9v_u15-mzeg8jY7nOO2gaVdjgMw@mail.gmail.com>
Date: Fri, 2 Mar 2018 13:11:56 -0800
From: Siwei Liu <loseweigh@...il.com>
To: Sridhar Samudrala <sridhar.samudrala@...el.com>
Cc: "Michael S. Tsirkin" <mst@...hat.com>,
Stephen Hemminger <stephen@...workplumber.org>,
David Miller <davem@...emloft.net>,
Netdev <netdev@...r.kernel.org>, Jiri Pirko <jiri@...nulli.us>,
virtio-dev@...ts.oasis-open.org,
"Brandeburg, Jesse" <jesse.brandeburg@...el.com>,
Alexander Duyck <alexander.h.duyck@...el.com>,
Jakub Kicinski <kubakici@...pl>
Subject: Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available
On Thu, Mar 1, 2018 at 12:08 PM, Sridhar Samudrala
<sridhar.samudrala@...el.com> wrote:
> This patch enables virtio_net to switch over to a VF datapath when a VF
> netdev is present with the same MAC address. It allows live migration
> of a VM with a direct attached VF without the need to setup a bond/team
> between a VF and virtio net device in the guest.
>
> The hypervisor needs to enable only one datapath at any time so that
> packets don't get looped back to the VM over the other datapath. When a VF
> is plugged, the virtio datapath link state can be marked as down. The
> hypervisor needs to unplug the VF device from the guest on the source host
> and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed,
> the destination hypervisor sets the MAC filter on the VF and plugs it back
> to the guest to switch over to VF datapath.
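To make the flow above concrete for readers following along: on a QEMU-based
setup the sequence would look roughly like this (illustrative only; the exact
commands, device ids, and the MAC-filter step depend on the management stack):

  source:       (qemu) device_del hostvf0        <- VF unplugged, virtio takes over
                (qemu) migrate -d tcp:<dst>:4444
  destination:  reprogram the VF MAC filter, then
                (qemu) device_add vfio-pci,host=<bdf>,id=hostvf0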
>
> When the BACKUP feature is enabled, an additional netdev (bypass netdev) is
> created that acts as a master device and tracks the state of the 2 lower
> netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
> passthru device with the same MAC is registered as 'active' netdev.
>
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@...el.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@...el.com>
> Reviewed-by: Jesse Brandeburg <jesse.brandeburg@...el.com>
> ---
> drivers/net/virtio_net.c | 683 ++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 682 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index bcd13fe906ca..f2860d86c952 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -30,6 +30,8 @@
> #include <linux/cpu.h>
> #include <linux/average.h>
> #include <linux/filter.h>
> +#include <linux/netdevice.h>
> +#include <linux/pci.h>
> #include <net/route.h>
> #include <net/xdp.h>
>
> @@ -206,6 +208,9 @@ struct virtnet_info {
> u32 speed;
>
> unsigned long guest_offloads;
> +
> + /* upper netdev created when BACKUP feature enabled */
> + struct net_device *bypass_netdev;
> };
>
> struct padded_vnet_hdr {
> @@ -2236,6 +2241,22 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> }
> }
>
> +static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
> + size_t len)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> + int ret;
> +
> + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
> + return -EOPNOTSUPP;
> +
> + ret = snprintf(buf, len, "_bkup");
> + if (ret >= len)
> + return -EOPNOTSUPP;
> +
> + return 0;
> +}
> +
What if systemd/udevd is not new enough to enforce the n<phys_port_name>
naming? Would virtio_bypass then get a different name than the original
virtio_net? Should we detect this earlier and fall back to legacy mode,
without creating the bypass netdev and enslaving the VF?
> static const struct net_device_ops virtnet_netdev = {
> .ndo_open = virtnet_open,
> .ndo_stop = virtnet_close,
> @@ -2253,6 +2274,7 @@ static const struct net_device_ops virtnet_netdev = {
> .ndo_xdp_xmit = virtnet_xdp_xmit,
> .ndo_xdp_flush = virtnet_xdp_flush,
> .ndo_features_check = passthru_features_check,
> + .ndo_get_phys_port_name = virtnet_get_phys_port_name,
> };
>
> static void virtnet_config_changed_work(struct work_struct *work)
> @@ -2647,6 +2669,653 @@ static int virtnet_validate(struct virtio_device *vdev)
> return 0;
> }
>
> +/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
> + * When the BACKUP feature is enabled, an additional netdev (bypass netdev)
> + * is created that acts as a master device and tracks the state of the
> + * 2 lower netdevs. The original virtio_net netdev is registered as
> + * 'backup' netdev and a passthru device with the same MAC is registered
> + * as 'active' netdev.
> + */
> +
> +/* bypass state maintained when BACKUP feature is enabled */
> +struct virtnet_bypass_info {
> + /* passthru netdev with same MAC */
> + struct net_device __rcu *active_netdev;
> +
> + /* virtio_net netdev */
> + struct net_device __rcu *backup_netdev;
> +
> + /* active netdev stats */
> + struct rtnl_link_stats64 active_stats;
> +
> + /* backup netdev stats */
> + struct rtnl_link_stats64 backup_stats;
> +
> + /* aggregated stats */
> + struct rtnl_link_stats64 bypass_stats;
> +
> + /* spinlock while updating stats */
> + spinlock_t stats_lock;
> +};
> +
> +static void virtnet_bypass_child_open(struct net_device *dev,
> + struct net_device *child_netdev)
> +{
> + int err = dev_open(child_netdev);
> +
> + if (err)
> + netdev_warn(dev, "unable to open slave: %s: %d\n",
> + child_netdev->name, err);
> +}
> +
> +static int virtnet_bypass_open(struct net_device *dev)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + struct net_device *child_netdev;
> +
> + netif_carrier_off(dev);
> + netif_tx_wake_all_queues(dev);
> +
> + child_netdev = rtnl_dereference(vbi->active_netdev);
> + if (child_netdev)
> + virtnet_bypass_child_open(dev, child_netdev);
> +
> + child_netdev = rtnl_dereference(vbi->backup_netdev);
> + if (child_netdev)
> + virtnet_bypass_child_open(dev, child_netdev);
> +
> + return 0;
> +}
> +
> +static int virtnet_bypass_close(struct net_device *dev)
> +{
> + struct virtnet_bypass_info *vi = netdev_priv(dev);
> + struct net_device *child_netdev;
> +
> + netif_tx_disable(dev);
> +
> + child_netdev = rtnl_dereference(vi->active_netdev);
> + if (child_netdev)
> + dev_close(child_netdev);
> +
> + child_netdev = rtnl_dereference(vi->backup_netdev);
> + if (child_netdev)
> + dev_close(child_netdev);
> +
> + return 0;
> +}
> +
> +static netdev_tx_t virtnet_bypass_drop_xmit(struct sk_buff *skb,
> + struct net_device *dev)
> +{
> + atomic_long_inc(&dev->tx_dropped);
> + dev_kfree_skb_any(skb);
> + return NETDEV_TX_OK;
> +}
> +
> +static bool virtnet_bypass_xmit_ready(struct net_device *dev)
> +{
> + return netif_running(dev) && netif_carrier_ok(dev);
> +}
> +
> +static netdev_tx_t virtnet_bypass_start_xmit(struct sk_buff *skb,
> + struct net_device *dev)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + struct net_device *xmit_dev;
> +
> + /* Try xmit via active netdev followed by backup netdev */
> + xmit_dev = rcu_dereference_bh(vbi->active_netdev);
> + if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev)) {
> + xmit_dev = rcu_dereference_bh(vbi->backup_netdev);
> + if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev))
> + return virtnet_bypass_drop_xmit(skb, dev);
> + }
> +
> + skb->dev = xmit_dev;
> + skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
> +
> + return dev_queue_xmit(skb);
> +}
> +
> +static u16 virtnet_bypass_select_queue(struct net_device *dev,
> + struct sk_buff *skb, void *accel_priv,
> + select_queue_fallback_t fallback)
> +{
> + /* This helper function exists to help dev_pick_tx get the correct
> + * destination queue. Using a helper function skips a call to
> + * skb_tx_hash and will put the skbs in the queue we expect on their
> + * way down to the bonding driver.
> + */
> + u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
> +
> + /* Save the original txq to restore before passing to the driver */
> + qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
> +
> + if (unlikely(txq >= dev->real_num_tx_queues)) {
> + do {
> + txq -= dev->real_num_tx_queues;
> + } while (txq >= dev->real_num_tx_queues);
> + }
> +
> + return txq;
> +}
> +
> +/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
> + * that some drivers can provide 32bit values only.
> + */
> +static void virtnet_bypass_fold_stats(struct rtnl_link_stats64 *_res,
> + const struct rtnl_link_stats64 *_new,
> + const struct rtnl_link_stats64 *_old)
> +{
> + const u64 *new = (const u64 *)_new;
> + const u64 *old = (const u64 *)_old;
> + u64 *res = (u64 *)_res;
> + int i;
> +
> + for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
> + u64 nv = new[i];
> + u64 ov = old[i];
> + s64 delta = nv - ov;
> +
> + /* detects if this particular field is 32bit only */
> + if (((nv | ov) >> 32) == 0)
> + delta = (s64)(s32)((u32)nv - (u32)ov);
> +
> + /* filter anomalies, some drivers reset their stats
> + * at down/up events.
> + */
> + if (delta > 0)
> + res[i] += delta;
> + }
> +}
> +
> +static void virtnet_bypass_get_stats(struct net_device *dev,
> + struct rtnl_link_stats64 *stats)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + const struct rtnl_link_stats64 *new;
> + struct rtnl_link_stats64 temp;
> + struct net_device *child_netdev;
> +
> + spin_lock(&vbi->stats_lock);
> + memcpy(stats, &vbi->bypass_stats, sizeof(*stats));
> +
> + rcu_read_lock();
> +
> + child_netdev = rcu_dereference(vbi->active_netdev);
> + if (child_netdev) {
> + new = dev_get_stats(child_netdev, &temp);
> + virtnet_bypass_fold_stats(stats, new, &vbi->active_stats);
> + memcpy(&vbi->active_stats, new, sizeof(*new));
> + }
> +
> + child_netdev = rcu_dereference(vbi->backup_netdev);
> + if (child_netdev) {
> + new = dev_get_stats(child_netdev, &temp);
> + virtnet_bypass_fold_stats(stats, new, &vbi->backup_stats);
> + memcpy(&vbi->backup_stats, new, sizeof(*new));
> + }
> +
> + rcu_read_unlock();
> +
> + memcpy(&vbi->bypass_stats, stats, sizeof(*stats));
> + spin_unlock(&vbi->stats_lock);
> +}
> +
> +static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + struct net_device *child_netdev;
> + int ret = 0;
> +
> + child_netdev = rcu_dereference(vbi->active_netdev);
> + if (child_netdev) {
> + ret = dev_set_mtu(child_netdev, new_mtu);
> + if (ret)
> + return ret;
> + }
> +
> + child_netdev = rcu_dereference(vbi->backup_netdev);
> + if (child_netdev) {
> + ret = dev_set_mtu(child_netdev, new_mtu);
> + if (ret)
> + netdev_err(child_netdev,
> + "Unexpected failure to set mtu to %d\n",
> + new_mtu);
Shouldn't we unwind the MTU change on active_netdev if setting it on
backup_netdev fails?
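Something like this, perhaps (untested sketch; ndo_change_mtu runs under
rtnl, so dev->mtu still holds the old value at this point):

	child_netdev = rtnl_dereference(vbi->backup_netdev);
	if (child_netdev) {
		ret = dev_set_mtu(child_netdev, new_mtu);
		if (ret) {
			struct net_device *active;

			/* best-effort rollback of the active netdev's MTU */
			active = rtnl_dereference(vbi->active_netdev);
			if (active)
				dev_set_mtu(active, dev->mtu);
			return ret;
		}
	}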
> + }
> +
> + dev->mtu = new_mtu;
> + return 0;
> +}
> +
> +static void virtnet_bypass_set_rx_mode(struct net_device *dev)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + struct net_device *child_netdev;
> +
> + rcu_read_lock();
> +
> + child_netdev = rcu_dereference(vbi->active_netdev);
> + if (child_netdev) {
> + dev_uc_sync_multiple(child_netdev, dev);
> + dev_mc_sync_multiple(child_netdev, dev);
> + }
> +
> + child_netdev = rcu_dereference(vbi->backup_netdev);
> + if (child_netdev) {
> + dev_uc_sync_multiple(child_netdev, dev);
> + dev_mc_sync_multiple(child_netdev, dev);
> + }
> +
If the VF comes up after set_rx_mode has been called, where do you sync
up the unicast and multicast addresses?
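I'd expect the enslave path to need an explicit sync, similar to what
bonding does in bond_enslave(). Maybe something along these lines in
virtnet_bypass_register_child(), once the child is up (untested sketch):

	if (netif_running(dev)) {
		netif_addr_lock_bh(dev);
		dev_uc_sync_multiple(child_netdev, dev);
		dev_mc_sync_multiple(child_netdev, dev);
		netif_addr_unlock_bh(dev);
	}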
The rest looks good.
Thanks,
-Siwei
> + rcu_read_unlock();
> +}
> +
> +static const struct net_device_ops virtnet_bypass_netdev_ops = {
> + .ndo_open = virtnet_bypass_open,
> + .ndo_stop = virtnet_bypass_close,
> + .ndo_start_xmit = virtnet_bypass_start_xmit,
> + .ndo_select_queue = virtnet_bypass_select_queue,
> + .ndo_get_stats64 = virtnet_bypass_get_stats,
> + .ndo_change_mtu = virtnet_bypass_change_mtu,
> + .ndo_set_rx_mode = virtnet_bypass_set_rx_mode,
> + .ndo_validate_addr = eth_validate_addr,
> + .ndo_features_check = passthru_features_check,
> +};
> +
> +static int
> +virtnet_bypass_ethtool_get_link_ksettings(struct net_device *dev,
> + struct ethtool_link_ksettings *cmd)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + struct net_device *child_netdev;
> +
> + child_netdev = rtnl_dereference(vbi->active_netdev);
> + if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
> + child_netdev = rtnl_dereference(vbi->backup_netdev);
> + if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
> + cmd->base.duplex = DUPLEX_UNKNOWN;
> + cmd->base.port = PORT_OTHER;
> + cmd->base.speed = SPEED_UNKNOWN;
> +
> + return 0;
> + }
> + }
> +
> + return __ethtool_get_link_ksettings(child_netdev, cmd);
> +}
> +
> +#define BYPASS_DRV_NAME "virtnet_bypass"
> +#define BYPASS_DRV_VERSION "0.1"
> +
> +static void virtnet_bypass_ethtool_get_drvinfo(struct net_device *dev,
> + struct ethtool_drvinfo *drvinfo)
> +{
> + strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
> + strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
> +}
> +
> +static const struct ethtool_ops virtnet_bypass_ethtool_ops = {
> + .get_drvinfo = virtnet_bypass_ethtool_get_drvinfo,
> + .get_link = ethtool_op_get_link,
> + .get_link_ksettings = virtnet_bypass_ethtool_get_link_ksettings,
> +};
> +
> +static struct net_device *get_virtnet_bypass_bymac(struct net *net,
> + const u8 *mac)
> +{
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(net, dev) {
> + if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
> + continue; /* not a virtnet_bypass device */
> +
> + if (ether_addr_equal(mac, dev->perm_addr))
> + return dev;
> + }
> +
> + return NULL;
> +}
> +
> +static struct net_device *
> +get_virtnet_bypass_byref(struct net_device *child_netdev)
> +{
> + struct net *net = dev_net(child_netdev);
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(net, dev) {
> + struct virtnet_bypass_info *vbi;
> +
> + if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
> + continue; /* not a virtnet_bypass device */
> +
> + vbi = netdev_priv(dev);
> +
> + if ((rtnl_dereference(vbi->active_netdev) == child_netdev) ||
> + (rtnl_dereference(vbi->backup_netdev) == child_netdev))
> + return dev; /* a match */
> + }
> +
> + return NULL;
> +}
> +
> +/* Called when child dev is injecting data into network stack.
> + * Change the associated network device from lower dev to virtio.
> + * note: already called with rcu_read_lock
> + */
> +static rx_handler_result_t virtnet_bypass_handle_frame(struct sk_buff **pskb)
> +{
> + struct sk_buff *skb = *pskb;
> + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
> +
> + skb->dev = ndev;
> +
> + return RX_HANDLER_ANOTHER;
> +}
> +
> +static int virtnet_bypass_register_child(struct net_device *child_netdev)
> +{
> + struct virtnet_bypass_info *vbi;
> + struct net_device *dev;
> + bool backup;
> + int ret;
> +
> + if (child_netdev->addr_len != ETH_ALEN)
> + return NOTIFY_DONE;
> +
> + /* We will use the MAC address to locate the virtnet_bypass netdev
> + * to associate with the child netdev. If we don't find a matching
> + * bypass netdev, move on.
> + */
> + dev = get_virtnet_bypass_bymac(dev_net(child_netdev),
> + child_netdev->perm_addr);
> + if (!dev)
> + return NOTIFY_DONE;
> +
> + vbi = netdev_priv(dev);
> + backup = (child_netdev->dev.parent == dev->dev.parent);
> + if (backup ? rtnl_dereference(vbi->backup_netdev) :
> + rtnl_dereference(vbi->active_netdev)) {
> + netdev_info(dev,
> + "%s attempting to join bypass dev when %s already present\n",
> + child_netdev->name, backup ? "backup" : "active");
> + return NOTIFY_DONE;
> + }
> +
> + /* Avoid non pci devices as active netdev */
> + if (!backup && (!child_netdev->dev.parent ||
> + !dev_is_pci(child_netdev->dev.parent)))
> + return NOTIFY_DONE;
> +
> + ret = netdev_rx_handler_register(child_netdev,
> + virtnet_bypass_handle_frame, dev);
> + if (ret != 0) {
> + netdev_err(child_netdev,
> + "can not register bypass receive handler (err = %d)\n",
> + ret);
> + goto rx_handler_failed;
> + }
> +
> + ret = netdev_upper_dev_link(child_netdev, dev, NULL);
> + if (ret != 0) {
> + netdev_err(child_netdev,
> + "can not set master device %s (err = %d)\n",
> + dev->name, ret);
> + goto upper_link_failed;
> + }
> +
> + child_netdev->flags |= IFF_SLAVE;
> +
> + if (netif_running(dev)) {
> + ret = dev_open(child_netdev);
> + if (ret && (ret != -EBUSY)) {
> + netdev_err(dev, "Opening child %s failed ret:%d\n",
> + child_netdev->name, ret);
> + goto err_interface_up;
> + }
> + }
> +
> + /* Align MTU of child with master */
> + ret = dev_set_mtu(child_netdev, dev->mtu);
> + if (ret) {
> + netdev_err(dev,
> + "unable to change mtu of %s to %u register failed\n",
> + child_netdev->name, dev->mtu);
> + goto err_set_mtu;
> + }
> +
> + call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
> +
> + netdev_info(dev, "registering %s\n", child_netdev->name);
> +
> + dev_hold(child_netdev);
> + if (backup) {
> + rcu_assign_pointer(vbi->backup_netdev, child_netdev);
> + dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
> + } else {
> + rcu_assign_pointer(vbi->active_netdev, child_netdev);
> + dev_get_stats(vbi->active_netdev, &vbi->active_stats);
> + dev->min_mtu = child_netdev->min_mtu;
> + dev->max_mtu = child_netdev->max_mtu;
> + }
> +
> + return NOTIFY_OK;
> +
> +err_set_mtu:
> + dev_close(child_netdev);
> +err_interface_up:
> + netdev_upper_dev_unlink(child_netdev, dev);
> + child_netdev->flags &= ~IFF_SLAVE;
> +upper_link_failed:
> + netdev_rx_handler_unregister(child_netdev);
> +rx_handler_failed:
> + return NOTIFY_DONE;
> +}
> +
> +static int virtnet_bypass_unregister_child(struct net_device *child_netdev)
> +{
> + struct virtnet_bypass_info *vbi;
> + struct net_device *dev, *backup;
> +
> + dev = get_virtnet_bypass_byref(child_netdev);
> + if (!dev)
> + return NOTIFY_DONE;
> +
> + vbi = netdev_priv(dev);
> +
> + netdev_info(dev, "unregistering %s\n", child_netdev->name);
> +
> + netdev_rx_handler_unregister(child_netdev);
> + netdev_upper_dev_unlink(child_netdev, dev);
> + child_netdev->flags &= ~IFF_SLAVE;
> +
> + if (child_netdev->dev.parent == dev->dev.parent) {
> + RCU_INIT_POINTER(vbi->backup_netdev, NULL);
> + } else {
> + RCU_INIT_POINTER(vbi->active_netdev, NULL);
> + backup = rtnl_dereference(vbi->backup_netdev);
> + if (backup) {
> + dev->min_mtu = backup->min_mtu;
> + dev->max_mtu = backup->max_mtu;
> + }
> + }
> +
> + dev_put(child_netdev);
> +
> + return NOTIFY_OK;
> +}
> +
> +static int virtnet_bypass_update_link(struct net_device *child_netdev)
> +{
> + struct net_device *dev, *active, *backup;
> + struct virtnet_bypass_info *vbi;
> +
> + dev = get_virtnet_bypass_byref(child_netdev);
> + if (!dev || !netif_running(dev))
> + return NOTIFY_DONE;
> +
> + vbi = netdev_priv(dev);
> +
> + active = rtnl_dereference(vbi->active_netdev);
> + backup = rtnl_dereference(vbi->backup_netdev);
> +
> + if ((active && virtnet_bypass_xmit_ready(active)) ||
> + (backup && virtnet_bypass_xmit_ready(backup))) {
> + netif_carrier_on(dev);
> + netif_tx_wake_all_queues(dev);
> + } else {
> + netif_carrier_off(dev);
> + netif_tx_stop_all_queues(dev);
> + }
> +
> + return NOTIFY_OK;
> +}
> +
> +static int virtnet_bypass_event(struct notifier_block *this,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> + /* Skip our own events */
> + if (event_dev->netdev_ops == &virtnet_bypass_netdev_ops)
> + return NOTIFY_DONE;
> +
> + /* Avoid non-Ethernet type devices */
> + if (event_dev->type != ARPHRD_ETHER)
> + return NOTIFY_DONE;
> +
> + /* Avoid Vlan dev with same MAC registering as child dev */
> + if (is_vlan_dev(event_dev))
> + return NOTIFY_DONE;
> +
> + /* Avoid Bonding master dev with same MAC registering as child dev */
> + if ((event_dev->priv_flags & IFF_BONDING) &&
> + (event_dev->flags & IFF_MASTER))
> + return NOTIFY_DONE;
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + return virtnet_bypass_register_child(event_dev);
> + case NETDEV_UNREGISTER:
> + return virtnet_bypass_unregister_child(event_dev);
> + case NETDEV_UP:
> + case NETDEV_DOWN:
> + case NETDEV_CHANGE:
> + return virtnet_bypass_update_link(event_dev);
> + default:
> + return NOTIFY_DONE;
> + }
> +}
> +
> +static struct notifier_block virtnet_bypass_notifier = {
> + .notifier_call = virtnet_bypass_event,
> +};
> +
> +static int virtnet_bypass_create(struct virtnet_info *vi)
> +{
> + struct net_device *backup_netdev = vi->dev;
> + struct device *dev = &vi->vdev->dev;
> + struct net_device *bypass_netdev;
> + int res;
> +
> + /* Alloc at least 2 queues, for now we are going with 16 assuming
> + * that most devices being bonded won't have too many queues.
> + */
> + bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
> + 16);
> + if (!bypass_netdev) {
> + dev_err(dev, "Unable to allocate bypass_netdev!\n");
> + return -ENOMEM;
> + }
> +
> + dev_net_set(bypass_netdev, dev_net(backup_netdev));
> + SET_NETDEV_DEV(bypass_netdev, dev);
> +
> + bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
> + bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
> +
> + /* Initialize the device options */
> + bypass_netdev->flags |= IFF_MASTER;
> + bypass_netdev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT |
> + IFF_NO_QUEUE;
> + bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
> + IFF_TX_SKB_SHARING);
> +
> + /* don't acquire bypass netdev's netif_tx_lock when transmitting */
> + bypass_netdev->features |= NETIF_F_LLTX;
> +
> + /* Don't allow bypass devices to change network namespaces. */
> + bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
> +
> + bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
> + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
> + NETIF_F_HIGHDMA | NETIF_F_LRO;
> +
> + bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
> + bypass_netdev->features |= bypass_netdev->hw_features;
> +
> + /* For now treat bypass netdev as VLAN challenged since we
> + * cannot assume VLAN functionality with a VF
> + */
> + bypass_netdev->features |= NETIF_F_VLAN_CHALLENGED;
> +
> + memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
> + bypass_netdev->addr_len);
> +
> + bypass_netdev->min_mtu = backup_netdev->min_mtu;
> + bypass_netdev->max_mtu = backup_netdev->max_mtu;
> +
> + res = register_netdev(bypass_netdev);
> + if (res < 0) {
> + dev_err(dev, "Unable to register bypass_netdev!\n");
> + free_netdev(bypass_netdev);
> + return res;
> + }
> +
> + netif_carrier_off(bypass_netdev);
> +
> + vi->bypass_netdev = bypass_netdev;
> +
> + return 0;
> +}
> +
> +static void virtnet_bypass_destroy(struct virtnet_info *vi)
> +{
> + struct net_device *bypass_netdev = vi->bypass_netdev;
> + struct virtnet_bypass_info *vbi;
> + struct net_device *child_netdev;
> +
> + /* no device found, nothing to free */
> + if (!bypass_netdev)
> + return;
> +
> + vbi = netdev_priv(bypass_netdev);
> +
> + netif_device_detach(bypass_netdev);
> +
> + rtnl_lock();
> +
> + child_netdev = rtnl_dereference(vbi->active_netdev);
> + if (child_netdev)
> + virtnet_bypass_unregister_child(child_netdev);
> +
> + child_netdev = rtnl_dereference(vbi->backup_netdev);
> + if (child_netdev)
> + virtnet_bypass_unregister_child(child_netdev);
> +
> + unregister_netdevice(bypass_netdev);
> +
> + rtnl_unlock();
> +
> + free_netdev(bypass_netdev);
> +}
> +
> +/* END of functions supporting VIRTIO_NET_F_BACKUP feature. */
> +
> static int virtnet_probe(struct virtio_device *vdev)
> {
> int i, err = -ENOMEM;
> @@ -2797,10 +3466,15 @@ static int virtnet_probe(struct virtio_device *vdev)
>
> virtnet_init_settings(dev);
>
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
> + if (virtnet_bypass_create(vi) != 0)
> + goto free_vqs;
> + }
> +
> err = register_netdev(dev);
> if (err) {
> pr_debug("virtio_net: registering device failed\n");
> - goto free_vqs;
> + goto free_bypass;
> }
>
> virtio_device_ready(vdev);
> @@ -2837,6 +3511,8 @@ static int virtnet_probe(struct virtio_device *vdev)
> vi->vdev->config->reset(vdev);
>
> unregister_netdev(dev);
> +free_bypass:
> + virtnet_bypass_destroy(vi);
> free_vqs:
> cancel_delayed_work_sync(&vi->refill);
> free_receive_page_frags(vi);
> @@ -2871,6 +3547,8 @@ static void virtnet_remove(struct virtio_device *vdev)
>
> unregister_netdev(vi->dev);
>
> + virtnet_bypass_destroy(vi);
> +
> remove_vq_common(vi);
>
> free_netdev(vi->dev);
> @@ -2968,6 +3646,8 @@ static __init int virtio_net_driver_init(void)
> ret = register_virtio_driver(&virtio_net_driver);
> if (ret)
> goto err_virtio;
> +
> + register_netdevice_notifier(&virtnet_bypass_notifier);
> return 0;
> err_virtio:
> cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2980,6 +3660,7 @@ module_init(virtio_net_driver_init);
>
> static __exit void virtio_net_driver_exit(void)
> {
> + unregister_netdevice_notifier(&virtnet_bypass_notifier);
> unregister_virtio_driver(&virtio_net_driver);
> cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> cpuhp_remove_multi_state(virtionet_online);
> --
> 2.14.3
>