[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20080422211512.GQ21637@solarflare.com>
Date: Tue, 22 Apr 2008 22:15:13 +0100
From: Ben Hutchings <bhutchings@...arflare.com>
To: Kieran Mansley <kmansley@...arflare.com>
Cc: Stephen Hemminger <shemminger@...tta.com>, netdev@...r.kernel.org
Subject: Re: LRO/GSO interaction when packets are forwarded
Kieran Mansley wrote:
> On Fri, 2008-03-07 at 08:25 -0800, Stephen Hemminger wrote:
> > On Fri, 07 Mar 2008 14:09:57 +0000
> > Kieran Mansley <kmansley@...arflare.com> wrote:
> >
> > > We've seen a couple of problems when using a bridge or IP forwarding
> > > combined with LRO packets generated by a network device driver. As you
> > > know, LRO packets can be either be page based (and passed up with
> > > lro_receive_page()) or use the skb frag_list (and passed up with
> > > lro_receive_skb()). In both cases it is likely that the device driver
> > > will have set CHECKSUM_UNNECESSARY to indicate that the packet has been
> > > checksummed by the device, and gso_size to mark it as an LRO packet and
> > > indicate the actual received MSS.
> >
> > First off, no hardware should ever do LRO on non-local packets. If the
> > hardware isn't smart enough to do this, I guess the bridge code to have
> > an API to turn it off. IP should also turn it off if ip_forwarding
> > is enabled on that device.
>
> If the only way to deal with this is to prevent LRO in those cases,
> having an API to turn if off would clearly be helpful: working out in
> the hardware or driver which packets can be forwarded or not is hard,
> and would probably be a layering violation in itself.
<snip>
Here's a first try at disabling LRO where it should not be used. I've
given it a little testing and would appreciate comments on whether this
is a reasonable approach.
There's one piece clearly missing: since the disabling of LRO can race
with packet reception, and since LRO can potentially be reenabled with
ethtool, the IP forwarding and bridging code needs to detect and drop
LRO'd packets.
Ben.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7c1d446..d7e8f1f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -866,6 +866,7 @@ extern struct net_device *__dev_get_by_name(struct net *net, const char *name);
extern int dev_alloc_name(struct net_device *dev, const char *name);
extern int dev_open(struct net_device *dev);
extern int dev_close(struct net_device *dev);
+extern void dev_disable_lro(struct net_device *dev);
extern int dev_queue_xmit(struct sk_buff *skb);
extern int register_netdevice(struct net_device *dev);
extern void unregister_netdevice(struct net_device *dev);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 298e0f4..1bd6631 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -387,6 +387,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
goto err2;
rcu_assign_pointer(dev->br_port, p);
+ dev_disable_lro(dev);
dev_set_promiscuity(dev, 1);
list_add_rcu(&p->list, &br->port_list);
diff --git a/net/core/dev.c b/net/core/dev.c
index e1df1ab..62cca32 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -90,6 +90,7 @@
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
@@ -1108,6 +1109,29 @@ int dev_close(struct net_device *dev)
}
+/**
+ * dev_disable_lro - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+ if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
+ dev->ethtool_ops->set_flags) {
+ u32 flags = dev->ethtool_ops->get_flags(dev);
+ if (flags & ETH_FLAG_LRO) {
+ flags &= ~ETH_FLAG_LRO;
+ dev->ethtool_ops->set_flags(dev, flags);
+ }
+ }
+ WARN_ON(dev->features & NETIF_F_LRO);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+
static int dev_boot_phase = 1;
/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6848e47..f88d395 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -171,6 +171,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
in_dev->dev = dev;
if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
goto out_kfree;
+ if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
+ dev_disable_lro(dev);
/* Reference in_dev->dev */
dev_hold(dev);
/* Account for reference dev->ip_ptr (below) */
@@ -1250,6 +1252,8 @@ static void inet_forward_change(struct net *net)
read_lock(&dev_base_lock);
for_each_netdev(net, dev) {
struct in_device *in_dev;
+ if (on)
+ dev_disable_lro(dev);
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev)
@@ -1257,8 +1261,6 @@ static void inet_forward_change(struct net *net)
rcu_read_unlock();
}
read_unlock(&dev_base_lock);
-
- rt_cache_flush(0);
}
static int devinet_conf_proc(ctl_table *ctl, int write,
@@ -1344,10 +1346,19 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
if (write && *valp != val) {
struct net *net = ctl->extra2;
- if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING))
- inet_forward_change(net);
- else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING))
+ if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
+ rtnl_lock();
+ if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+ inet_forward_change(net);
+ } else if (*valp) {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct in_device *idev =
+ container_of(cnf, struct in_device, cnf);
+ dev_disable_lro(idev->dev);
+ }
+ rtnl_unlock();
rt_cache_flush(0);
+ }
}
return ret;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8a0fd40..5be6874 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -344,6 +344,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
kfree(ndev);
return NULL;
}
+ if (ndev->cnf.forwarding)
+ dev_disable_lro(dev);
/* We refer to the device */
dev_hold(dev);
@@ -438,6 +440,8 @@ static void dev_forward_change(struct inet6_dev *idev)
if (!idev)
return;
dev = idev->dev;
+ if (idev->cnf.forwarding)
+ dev_disable_lro(dev);
if (dev && (dev->flags & IFF_MULTICAST)) {
if (idev->cnf.forwarding)
ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
@@ -483,12 +487,14 @@ static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
if (p == &net->ipv6.devconf_dflt->forwarding)
return;
+ rtnl_lock();
if (p == &net->ipv6.devconf_all->forwarding) {
__s32 newf = net->ipv6.devconf_all->forwarding;
net->ipv6.devconf_dflt->forwarding = newf;
addrconf_forward_change(net, newf);
} else if ((!*p) ^ (!old))
dev_forward_change((struct inet6_dev *)table->extra1);
+ rtnl_unlock();
if (*p)
rt6_purge_dflt_routers(net);
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists