[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170105125656.GB2211@nanopsycho>
Date: Thu, 5 Jan 2017 13:56:56 +0100
From: Jiri Pirko <jiri@...nulli.us>
To: Sridhar Samudrala <sridhar.samudrala@...el.com>
Cc: alexander.h.duyck@...el.com, john.r.fastabend@...el.com,
anjali.singhai@...el.com, jakub.kicinski@...ronome.com,
davem@...emloft.net, intel-wired-lan@...ts.osuosl.org,
netdev@...r.kernel.org
Subject: Re: [net-next PATCH v2 5/6] i40e: Add TX and RX support in switchdev
mode.
Tue, Jan 03, 2017 at 07:07:53PM CET, sridhar.samudrala@...el.com wrote:
>In switchdev mode, broadcast filter is not enabled on VFs. The broadcasts and
>unknown frames from VFs are received by the PF and passed to corresponding VF
>port representator netdev.
>A host based switching entity like a linux bridge or OVS redirects these frames
>to the right VFs via VFPR netdevs. Any frames sent via VFPR netdevs are sent as
>directed transmits to the corresponding VFs. To enable directed transmit, skb
>metadata dst is used to pass the VF id and the frame is requeued to call the PFs
>transmit routine.
>
>Small script to demonstrate inter VF pings in switchdev mode.
>PF: enp5s0f0, VFs: enp5s2,enp5s2f1 VFPRs:enp5s0f0-vf0, enp5s0f0-vf1
>
># rmmod i40e; modprobe i40e
># devlink dev eswitch set pci/0000:05:00.0 mode switchdev
># echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
># ip link set enp5s0f0 vf 0 mac 00:11:22:33:44:55
># ip link set enp5s0f0 vf 1 mac 00:11:22:33:44:56
># rmmod i40evf; modprobe i40evf
>
>/* Create 2 namespaces and move the VFs to the corresponding ns. */
># ip netns add ns0
># ip link set enp5s2 netns ns0
># ip netns exec ns0 ip addr add 192.168.1.10/24 dev enp5s2
># ip netns exec ns0 ip link set enp5s2 up
># ip netns add ns1
># ip link set enp5s2f1 netns ns1
># ip netns exec ns1 ip addr add 192.168.1.11/24 dev enp5s2f1
># ip netns exec ns1 ip link set enp5s2f1 up
>
>/* bring up pf and vfpr netdevs */
># ip link set enp5s0f0 up
># ip link set enp5s0f0-vf0 up
># ip link set enp5s0f0-vf1 up
>
>/* Create a linux bridge and add vfpr netdevs to it. */
># ip link add vfpr-br type bridge
># ip link set enp5s0f0-vf0 master vfpr-br
># ip link set enp5s0f0-vf1 master vfpr-br
># ip addr add 192.168.1.1/24 dev vfpr-br
># ip link set vfpr-br up
>
># ip netns exec ns0 ping -c3 192.168.1.11
># ip netns exec ns1 ping -c3 192.168.1.10
>
>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@...el.com>
[...]
>diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>index 352cf7c..b46ddaa 100644
>--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>@@ -1176,16 +1176,37 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
> * @rx_ring: rx ring in play
> * @skb: packet to send up
> * @vlan_tag: vlan tag for packet
>+ * @lpbk: is it a loopback frame?
> **/
> static void i40e_receive_skb(struct i40e_ring *rx_ring,
>- struct sk_buff *skb, u16 vlan_tag)
>+ struct sk_buff *skb, u16 vlan_tag, bool lpbk)
> {
> struct i40e_q_vector *q_vector = rx_ring->q_vector;
>+ struct i40e_pf *pf = rx_ring->vsi->back;
>+ struct i40e_vf *vf;
>+ struct ethhdr *eth;
>+ int vf_id;
>
> if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
> (vlan_tag & VLAN_VID_MASK))
> __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
>
>+ if ((pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY) || !lpbk)
>+ goto gro_receive;
>+
>+ /* If a loopback packet is received from a VF in switchdev mode, pass the
>+ * frame to the corresponding VFPR netdev based on the source MAC in the frame.
>+ */
>+ eth = (struct ethhdr *)skb_mac_header(skb);
>+ for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) {
>+ vf = &pf->vf[vf_id];
>+ if (ether_addr_equal(eth->h_source, vf->default_lan_addr.addr)) {
>+ skb->dev = vf->vfpr_netdev;
This sucks :( Is't there any identification coming from rx ring that
would tell you what vf this is? To match vrpr according to a single MAC
address seems a bit awkward. What if there is a macvlan configured
on the VF?
>+ break;
>+ }
>+ }
>+
>+gro_receive:
> napi_gro_receive(&q_vector->napi, skb);
> }
>
[...]
>@@ -2998,3 +3064,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>
> return i40e_xmit_frame_ring(skb, tx_ring);
> }
>+
>+netdev_tx_t i40e_vfpr_netdev_start_xmit(struct sk_buff *skb,
>+ struct net_device *dev)
>+{
>+ struct i40e_vfpr_netdev_priv *priv = netdev_priv(dev);
>+ struct i40e_vf *vf = priv->vf;
>+ struct i40e_pf *pf = vf->pf;
>+ struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
>+
>+ skb_dst_drop(skb);
>+ dst_hold(&priv->vfpr_dst->dst);
>+ skb_dst_set(skb, &priv->vfpr_dst->dst);
>+ skb->dev = vsi->netdev;
This dst dance seems a bit odd to me. Why don't you just call
i40e_xmit_frame_ring with an extra arg holding the needed metadata?
>+
>+ return dev_queue_xmit(skb);
>+}
>diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>index e065321..850723f 100644
>--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>@@ -366,6 +366,7 @@ struct i40e_ring_container {
>
> bool i40e_alloc_rx_buffers(struct i40e_ring *rxr, u16 cleaned_count);
> netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
>+netdev_tx_t i40e_vfpr_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev);
> void i40e_clean_tx_ring(struct i40e_ring *tx_ring);
> void i40e_clean_rx_ring(struct i40e_ring *rx_ring);
> int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring);
>diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
>index edc0abd..c136cc9 100644
>--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
>+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
>@@ -728,6 +728,9 @@ enum i40e_rx_desc_status_bits {
> #define I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT I40E_RX_DESC_STATUS_TSYNVALID_SHIFT
> #define I40E_RXD_QW1_STATUS_TSYNVALID_MASK \
> BIT_ULL(I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT)
>+#define I40E_RXD_QW1_STATUS_LPBK_SHIFT I40E_RX_DESC_STATUS_LPBK_SHIFT
>+#define I40E_RXD_QW1_STATUS_LPBK_MASK \
>+ BIT_ULL(I40E_RXD_QW1_STATUS_LPBK_SHIFT)
>
> enum i40e_rx_desc_fltstat_values {
> I40E_RX_DESC_FLTSTAT_NO_DATA = 0,
>diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>index 0c8687d..f0860ef 100644
>--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>@@ -1062,6 +1062,7 @@ static int i40e_vfpr_netdev_stop(struct net_device *dev)
> static const struct net_device_ops i40e_vfpr_netdev_ops = {
> .ndo_open = i40e_vfpr_netdev_open,
> .ndo_stop = i40e_vfpr_netdev_stop,
>+ .ndo_start_xmit = i40e_vfpr_netdev_start_xmit,
> };
>
> /**
>@@ -1121,16 +1122,22 @@ int i40e_alloc_vfpr_netdev(struct i40e_vf *vf, u16 vf_num)
>
> priv = netdev_priv(vfpr_netdev);
> priv->vf = &(pf->vf[vf_num]);
>+ priv->vfpr_dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL);
I'm missing a patch here. In net-next, metadata_dst_alloc has 2 args.
Powered by blists - more mailing lists