[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20171221021155-mutt-send-email-mst@kernel.org>
Date: Thu, 21 Dec 2017 02:14:46 +0200
From: "Michael S. Tsirkin" <mst@...hat.com>
To: Sridhar Samudrala <sridhar.samudrala@...el.com>
Cc: stephen@...workplumber.org, netdev@...r.kernel.org,
virtualization@...ts.linux-foundation.org,
alexander.duyck@...il.com
Subject: Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when
available
On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
> This patch enables virtio to switch over to a VF datapath when a VF netdev
> is present with the same MAC address.
I prefer saying "a passthrough device" here. Does not have to be a VF at
all.
> It allows live migration of a VM
> with a direct attached VF without the need to setup a bond/team between a
> VF and virtio net device in the guest.
>
> The hypervisor needs to unplug the VF device from the guest on the source
> host and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed, the
> destination hypervisor sets the MAC filter on the VF and plugs it back to
> the guest to switch over to VF datapath.
>
> It is entirely based on the netvsc implementation and it should be possible
> to make this code generic and move it to a common location that can be
> shared by netvsc and virtio.
>
> Also, I think we should make this a negotiated feature that is off by
> default via a new feature bit.
So please include this. A copy needs to go to virtio TC
to reserve the bit. Enabling this by default risks breaking
too many configurations.
>
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@...el.com>
> Reviewed-by: Jesse Brandeburg <jesse.brandeburg@...el.com>
> ---
> drivers/net/virtio_net.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 339 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 559b215c0169..a34c717bb15b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,8 @@
> #include <linux/average.h>
> #include <linux/filter.h>
> #include <net/route.h>
> +#include <linux/netdevice.h>
> +#include <linux/netpoll.h>
>
> static int napi_weight = NAPI_POLL_WEIGHT;
> module_param(napi_weight, int, 0444);
> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
> */
> DECLARE_EWMA(pkt_len, 0, 64)
>
> +#define VF_TAKEOVER_INT (HZ / 10)
> +
> #define VIRTNET_DRIVER_VERSION "1.0.0"
>
> static const unsigned long guest_offloads[] = {
> @@ -117,6 +121,15 @@ struct receive_queue {
> char name[40];
> };
>
> +struct virtnet_vf_pcpu_stats {
> + u64 rx_packets;
> + u64 rx_bytes;
> + u64 tx_packets;
> + u64 tx_bytes;
> + struct u64_stats_sync syncp;
> + u32 tx_dropped;
> +};
> +
> struct virtnet_info {
> struct virtio_device *vdev;
> struct virtqueue *cvq;
> @@ -179,6 +192,11 @@ struct virtnet_info {
> u32 speed;
>
> unsigned long guest_offloads;
> +
> + /* State to manage the associated VF interface. */
> + struct net_device __rcu *vf_netdev;
> + struct virtnet_vf_pcpu_stats __percpu *vf_stats;
> + struct delayed_work vf_takeover;
> };
>
> struct padded_vnet_hdr {
> @@ -1300,16 +1318,51 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
> }
>
> +/* Send skb on the slave VF device. */
> +static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
> + struct sk_buff *skb)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> + unsigned int len = skb->len;
> + int rc;
> +
> + skb->dev = vf_netdev;
> + skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
> +
> + rc = dev_queue_xmit(skb);
> + if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
> + struct virtnet_vf_pcpu_stats *pcpu_stats
> + = this_cpu_ptr(vi->vf_stats);
> +
> + u64_stats_update_begin(&pcpu_stats->syncp);
> + pcpu_stats->tx_packets++;
> + pcpu_stats->tx_bytes += len;
> + u64_stats_update_end(&pcpu_stats->syncp);
> + } else {
> + this_cpu_inc(vi->vf_stats->tx_dropped);
> + }
> +
> + return rc;
> +}
> +
> static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> int qnum = skb_get_queue_mapping(skb);
> struct send_queue *sq = &vi->sq[qnum];
> + struct net_device *vf_netdev;
> int err;
> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> bool kick = !skb->xmit_more;
> bool use_napi = sq->napi.weight;
>
> + /* if VF is present and up then redirect packets
> + * called with rcu_read_lock_bh
> + */
> + vf_netdev = rcu_dereference_bh(vi->vf_netdev);
> + if (vf_netdev && netif_running(vf_netdev) && !netpoll_tx_running(dev))
> + return virtnet_vf_xmit(dev, vf_netdev, skb);
> +
> /* Free up any pending old buffers before queueing new ones. */
> free_old_xmit_skbs(sq);
>
> @@ -1456,10 +1509,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
> return ret;
> }
>
> +static void virtnet_get_vf_stats(struct net_device *dev,
> + struct virtnet_vf_pcpu_stats *tot)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> + int i;
> +
> + memset(tot, 0, sizeof(*tot));
> +
> + for_each_possible_cpu(i) {
> + const struct virtnet_vf_pcpu_stats *stats
> + = per_cpu_ptr(vi->vf_stats, i);
> + u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
> + unsigned int start;
> +
> + do {
> + start = u64_stats_fetch_begin_irq(&stats->syncp);
> + rx_packets = stats->rx_packets;
> + tx_packets = stats->tx_packets;
> + rx_bytes = stats->rx_bytes;
> + tx_bytes = stats->tx_bytes;
> + } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> +
> + tot->rx_packets += rx_packets;
> + tot->tx_packets += tx_packets;
> + tot->rx_bytes += rx_bytes;
> + tot->tx_bytes += tx_bytes;
> + tot->tx_dropped += stats->tx_dropped;
> + }
> +}
> +
> static void virtnet_stats(struct net_device *dev,
> struct rtnl_link_stats64 *tot)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> + struct virtnet_vf_pcpu_stats vf_stats;
> int cpu;
> unsigned int start;
>
> @@ -1490,6 +1574,13 @@ static void virtnet_stats(struct net_device *dev,
> tot->rx_dropped = dev->stats.rx_dropped;
> tot->rx_length_errors = dev->stats.rx_length_errors;
> tot->rx_frame_errors = dev->stats.rx_frame_errors;
> +
> + virtnet_get_vf_stats(dev, &vf_stats);
> + tot->rx_packets += vf_stats.rx_packets;
> + tot->tx_packets += vf_stats.tx_packets;
> + tot->rx_bytes += vf_stats.rx_bytes;
> + tot->tx_bytes += vf_stats.tx_bytes;
> + tot->tx_dropped += vf_stats.tx_dropped;
> }
>
> #ifdef CONFIG_NET_POLL_CONTROLLER
> @@ -2508,6 +2599,47 @@ static int virtnet_validate(struct virtio_device *vdev)
> return 0;
> }
>
> +static void __virtnet_vf_setup(struct net_device *ndev,
> + struct net_device *vf_netdev)
> +{
> + int ret;
> +
> + /* Align MTU of VF with master */
> + ret = dev_set_mtu(vf_netdev, ndev->mtu);
> + if (ret)
> + netdev_warn(vf_netdev,
> + "unable to change mtu to %u\n", ndev->mtu);
> +
> + if (netif_running(ndev)) {
> + ret = dev_open(vf_netdev);
> + if (ret)
> + netdev_warn(vf_netdev,
> + "unable to open: %d\n", ret);
> + }
> +}
> +
> +/* Setup VF as slave of the virtio device.
> + * Runs in workqueue to avoid recursion in netlink callbacks.
> + */
> +static void virtnet_vf_setup(struct work_struct *w)
> +{
> + struct virtnet_info *vi
> + = container_of(w, struct virtnet_info, vf_takeover.work);
> + struct net_device *ndev = vi->dev;
> + struct net_device *vf_netdev;
> +
> + if (!rtnl_trylock()) {
> + schedule_delayed_work(&vi->vf_takeover, 0);
> + return;
> + }
> +
> + vf_netdev = rtnl_dereference(vi->vf_netdev);
> + if (vf_netdev)
> + __virtnet_vf_setup(ndev, vf_netdev);
> +
> + rtnl_unlock();
> +}
> +
> static int virtnet_probe(struct virtio_device *vdev)
> {
> int i, err;
> @@ -2600,6 +2732,11 @@ static int virtnet_probe(struct virtio_device *vdev)
> }
>
> INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> + INIT_DELAYED_WORK(&vi->vf_takeover, virtnet_vf_setup);
> +
> + vi->vf_stats = netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
> + if (!vi->vf_stats)
> + goto free_stats;
>
> /* If we can receive ANY GSO packets, we must allocate large ones. */
> if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -2634,7 +2771,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> */
> dev_err(&vdev->dev, "device MTU appears to have changed "
> "it is now %d < %d", mtu, dev->min_mtu);
> - goto free_stats;
> + goto free_vf_stats;
> }
>
> dev->mtu = mtu;
> @@ -2658,7 +2795,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
> err = init_vqs(vi);
> if (err)
> - goto free_stats;
> + goto free_vf_stats;
>
> #ifdef CONFIG_SYSFS
> if (vi->mergeable_rx_bufs)
> @@ -2712,6 +2849,8 @@ static int virtnet_probe(struct virtio_device *vdev)
> cancel_delayed_work_sync(&vi->refill);
> free_receive_page_frags(vi);
> virtnet_del_vqs(vi);
> +free_vf_stats:
> + free_percpu(vi->vf_stats);
> free_stats:
> free_percpu(vi->stats);
> free:
> @@ -2733,19 +2872,178 @@ static void remove_vq_common(struct virtnet_info *vi)
> virtnet_del_vqs(vi);
> }
>
> +static struct net_device *get_virtio_bymac(const u8 *mac)
> +{
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(&init_net, dev) {
> + if (dev->netdev_ops != &virtnet_netdev)
> + continue; /* not a virtio_net device */
> +
> + if (ether_addr_equal(mac, dev->perm_addr))
> + return dev;
> + }
> +
> + return NULL;
> +}
> +
> +static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
> +{
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(&init_net, dev) {
> + struct virtnet_info *vi;
> +
> + if (dev->netdev_ops != &virtnet_netdev)
> + continue; /* not a virtio_net device */
> +
> + vi = netdev_priv(dev);
> + if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
> + return dev; /* a match */
> + }
> +
> + return NULL;
> +}
> +
> +/* Called when VF is injecting data into network stack.
> + * Change the associated network device from VF to virtio.
> + * note: already called with rcu_read_lock
> + */
> +static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
> +{
> + struct sk_buff *skb = *pskb;
> + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
> + struct virtnet_info *vi = netdev_priv(ndev);
> + struct virtnet_vf_pcpu_stats *pcpu_stats =
> + this_cpu_ptr(vi->vf_stats);
> +
> + skb->dev = ndev;
> +
> + u64_stats_update_begin(&pcpu_stats->syncp);
> + pcpu_stats->rx_packets++;
> + pcpu_stats->rx_bytes += skb->len;
> + u64_stats_update_end(&pcpu_stats->syncp);
> +
> + return RX_HANDLER_ANOTHER;
> +}
> +
> +static int virtnet_vf_join(struct net_device *vf_netdev,
> + struct net_device *ndev)
> +{
> + struct virtnet_info *vi = netdev_priv(ndev);
> + int ret;
> +
> + ret = netdev_rx_handler_register(vf_netdev,
> + virtnet_vf_handle_frame, ndev);
> + if (ret != 0) {
> + netdev_err(vf_netdev,
> + "can not register virtio VF receive handler (err = %d)\n",
> + ret);
> + goto rx_handler_failed;
> + }
> +
> + ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
> + if (ret != 0) {
> + netdev_err(vf_netdev,
> + "can not set master device %s (err = %d)\n",
> + ndev->name, ret);
> + goto upper_link_failed;
> + }
> +
> + /* set slave flag before open to prevent IPv6 addrconf */
> + vf_netdev->flags |= IFF_SLAVE;
> +
> + schedule_delayed_work(&vi->vf_takeover, VF_TAKEOVER_INT);
> +
> + call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
> +
> + netdev_info(vf_netdev, "joined to %s\n", ndev->name);
> + return 0;
> +
> +upper_link_failed:
> + netdev_rx_handler_unregister(vf_netdev);
> +rx_handler_failed:
> + return ret;
> +}
> +
> +static int virtnet_register_vf(struct net_device *vf_netdev)
> +{
> + struct net_device *ndev;
> + struct virtnet_info *vi;
> +
> + if (vf_netdev->addr_len != ETH_ALEN)
> + return NOTIFY_DONE;
> +
> + /* We will use the MAC address to locate the virtio_net interface to
> + * associate with the VF interface. If we don't find a matching
> + * virtio interface, move on.
> + */
> + ndev = get_virtio_bymac(vf_netdev->perm_addr);
> + if (!ndev)
> + return NOTIFY_DONE;
> +
> + vi = netdev_priv(ndev);
> + if (rtnl_dereference(vi->vf_netdev))
> + return NOTIFY_DONE;
> +
> + if (virtnet_vf_join(vf_netdev, ndev) != 0)
> + return NOTIFY_DONE;
> +
> + netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
> +
> + dev_hold(vf_netdev);
> + rcu_assign_pointer(vi->vf_netdev, vf_netdev);
> +
> + return NOTIFY_OK;
> +}
> +
> +static int virtnet_unregister_vf(struct net_device *vf_netdev)
> +{
> + struct net_device *ndev;
> + struct virtnet_info *vi;
> +
> + ndev = get_virtio_byref(vf_netdev);
> + if (!ndev)
> + return NOTIFY_DONE;
> +
> + vi = netdev_priv(ndev);
> + cancel_delayed_work_sync(&vi->vf_takeover);
> +
> + netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
> +
> + netdev_rx_handler_unregister(vf_netdev);
> + netdev_upper_dev_unlink(vf_netdev, ndev);
> + RCU_INIT_POINTER(vi->vf_netdev, NULL);
> + dev_put(vf_netdev);
> +
> + return NOTIFY_OK;
> +}
> +
> static void virtnet_remove(struct virtio_device *vdev)
> {
> struct virtnet_info *vi = vdev->priv;
> + struct net_device *vf_netdev;
>
> virtnet_cpu_notif_remove(vi);
>
> /* Make sure no work handler is accessing the device. */
> flush_work(&vi->config_work);
>
> + rtnl_lock();
> + vf_netdev = rtnl_dereference(vi->vf_netdev);
> + if (vf_netdev)
> + virtnet_unregister_vf(vf_netdev);
> + rtnl_unlock();
> +
> unregister_netdev(vi->dev);
>
> remove_vq_common(vi);
>
> + free_percpu(vi->vf_stats);
> free_percpu(vi->stats);
> free_netdev(vi->dev);
> }
> @@ -2823,6 +3121,42 @@ static struct virtio_driver virtio_net_driver = {
> #endif
> };
>
> +static int virtio_netdev_event(struct notifier_block *this,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> + /* Skip our own events */
> + if (event_dev->netdev_ops == &virtnet_netdev)
> + return NOTIFY_DONE;
> +
> + /* Avoid non-Ethernet type devices */
> + if (event_dev->type != ARPHRD_ETHER)
> + return NOTIFY_DONE;
> +
> + /* Avoid Vlan dev with same MAC registering as VF */
> + if (is_vlan_dev(event_dev))
> + return NOTIFY_DONE;
> +
> + /* Avoid Bonding master dev with same MAC registering as VF */
> + if ((event_dev->priv_flags & IFF_BONDING) &&
> + (event_dev->flags & IFF_MASTER))
> + return NOTIFY_DONE;
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + return virtnet_register_vf(event_dev);
> + case NETDEV_UNREGISTER:
> + return virtnet_unregister_vf(event_dev);
> + default:
> + return NOTIFY_DONE;
> + }
> +}
> +
> +static struct notifier_block virtio_netdev_notifier = {
> + .notifier_call = virtio_netdev_event,
> +};
> +
> static __init int virtio_net_driver_init(void)
> {
> int ret;
> @@ -2841,6 +3175,8 @@ static __init int virtio_net_driver_init(void)
> ret = register_virtio_driver(&virtio_net_driver);
> if (ret)
> goto err_virtio;
> +
> + register_netdevice_notifier(&virtio_netdev_notifier);
> return 0;
> err_virtio:
> cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2853,6 +3189,7 @@ module_init(virtio_net_driver_init);
>
> static __exit void virtio_net_driver_exit(void)
> {
> + unregister_netdevice_notifier(&virtio_netdev_notifier);
> unregister_virtio_driver(&virtio_net_driver);
> cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> cpuhp_remove_multi_state(virtionet_online);
> --
> 2.14.3
Powered by blists - more mailing lists