lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <54F4741F.50406@cumulusnetworks.com>
Date:	Mon, 02 Mar 2015 06:30:55 -0800
From:	roopa <roopa@...ulusnetworks.com>
To:	sfeldma@...il.com
CC:	netdev@...r.kernel.org, davem@...emloft.net, jiri@...nulli.us
Subject: Re: [PATCH net-next v2 2/4] net: add IPv4 routing FIB support for
 switchdev

On 3/2/15, 2:06 AM, sfeldma@...il.com wrote:
> From: Scott Feldman <sfeldma@...il.com>
>
> Add two new ndo ops (ndo_switch_fib_ipv4_add/del) for switchdev devices
> capable of offloading IPv4 L3 routing function from the kernel.  The ops are
> called by the core IPv4 FIB code when installing/removing/modifying FIB entries
> in the kernel FIB.  On install/modify, the driver should return 0 if FIB entry
> (route) can be installed/modified to device; -EOPNOTSUPP if route cannot be
> installed/modified due to device limitations; and any other negative error code
> on failure to install route to device.  On failure error code, the route is not
> installed to device, and not installed in kernel FIB, and the return code is
> propagated back to the user-space caller (via netlink).  An -EOPNOTSUPP error
> code is skipped for the device but installed in the kernel FIB.
>
> The FIB entry (route) nexthop list is used to find the switchdev netdev to
> anchor the ndo op call.  The route's fib_dev (the first nexthop's dev) is used
> to find the switchdev netdev by recursively traversing the fib_dev's lower_dev
> list until a switchdev netdev is found.  The ndo op is called on this switchdev
> netdev.  This downward traversal is necessary for switchdev ports stacked under
> bonds and/or bridges, where the bond or bridge has the L3 interface.
>
> The switchdev driver can monitor netevent notifier NETEVENT_NEIGH_UPDATE to
> know neighbor IP addresses which are resolved to a MAC address.  In the case
> where the route's nexthops list contains unresolved neighbor IP addresses, the
> driver can ask the kernel to resolve the neighbor.  As route nexthops are
> resolved, the driver has enough information to program the device for
> L3 forwarding offload.
>
> Signed-off-by: Scott Feldman <sfeldma@...il.com>
> Signed-off-by: Jiri Pirko <jiri@...nulli.us>
> ---
>   include/linux/netdevice.h |   22 +++++++++++
>   include/net/switchdev.h   |   19 +++++++++
>   net/ipv4/fib_trie.c       |   33 ++++++++++++++--
>   net/switchdev/switchdev.c |   95 +++++++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 166 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 5897b4e..73b2766 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -769,6 +769,8 @@ struct netdev_phys_item_id {
>   typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>   				       struct sk_buff *skb);
>   
> +struct fib_info;
> +
>   /*
>    * This structure defines the management hooks for network devices.
>    * The following hooks can be defined; unless noted otherwise, they are
> @@ -1032,6 +1034,14 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>    * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state);
>    *	Called to notify switch device port of bridge port STP
>    *	state change.
> + * int (*ndo_sw_parent_fib_ipv4_add)(struct net_device *dev, __be32 dst,
> + *				     int dst_len, struct fib_info *fi,
> + *				     u8 tos, u8 type, u32 tb_id);
> + *	Called to add/modify IPv4 route to switch device.
> + * int (*ndo_sw_parent_fib_ipv4_del)(struct net_device *dev, __be32 dst,
> + *				     int dst_len, struct fib_info *fi,
> + *				     u8 tos, u8 type, u32 tb_id);
> + *	Called to delete IPv4 route from switch device.
>    */
>   struct net_device_ops {
>   	int			(*ndo_init)(struct net_device *dev);
> @@ -1193,6 +1203,18 @@ struct net_device_ops {
>   							    struct netdev_phys_item_id *psid);
>   	int			(*ndo_switch_port_stp_update)(struct net_device *dev,
>   							      u8 state);
> +	int			(*ndo_switch_fib_ipv4_add)(struct net_device *dev,
> +							   __be32 dst,
> +							   int dst_len,
> +							   struct fib_info *fi,
> +							   u8 tos, u8 type,
> +							   u32 tb_id);
> +	int			(*ndo_switch_fib_ipv4_del)(struct net_device *dev,
> +							   __be32 dst,
> +							   int dst_len,
> +							   struct fib_info *fi,
> +							   u8 tos, u8 type,
> +							   u32 tb_id);
>   #endif
>   };
>   
> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> index cfcdac2..4b2fc3f2 100644
> --- a/include/net/switchdev.h
> +++ b/include/net/switchdev.h
> @@ -51,6 +51,11 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
>   					       struct nlmsghdr *nlh, u16 flags);
>   int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
>   					       struct nlmsghdr *nlh, u16 flags);
> +int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
> +			       u8 tos, u8 type, u32 tb_id);
> +int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
> +			       u8 tos, u8 type, u32 tb_id);
> +
>   #else
>   
>   static inline int netdev_switch_parent_id_get(struct net_device *dev,
> @@ -109,6 +114,20 @@ static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *
>   	return 0;
>   }
>   
> +static inline int netdev_switch_fib_ipv4_add(u32 dst, int dst_len,
> +					     struct fib_info *fi,
> +					     u8 tos, u8 type, u32 tb_id)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int netdev_switch_fib_ipv4_del(u32 dst, int dst_len,
> +					     struct fib_info *fi,
> +					     u8 tos, u8 type, u32 tb_id)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>   #endif
>   
>   #endif /* _LINUX_SWITCHDEV_H_ */
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index f485345..b834e9c 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -79,6 +79,7 @@
>   #include <net/tcp.h>
>   #include <net/sock.h>
>   #include <net/ip_fib.h>
> +#include <net/switchdev.h>
>   #include "fib_lookup.h"
>   
>   #define MAX_STAT_DEPTH 32
> @@ -1161,7 +1162,17 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
>   			new_fa->fa_state = state & ~FA_S_ACCESSED;
>   			new_fa->fa_slen = fa->fa_slen;
>   
> +			err = netdev_switch_fib_ipv4_add(key, plen, fi,
> +							 new_fa->fa_tos,
> +							 cfg->fc_type,
> +							 tb->tb_id);
> +			if (err && err != -EOPNOTSUPP) {
> +				kmem_cache_free(fn_alias_kmem, new_fa);
> +				goto out;
> +			}
> +
>   			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
> +
>   			alias_free_mem_rcu(fa);
>   
>   			fib_release_info(fi_drop);

This looks like the replace case. Won't it also need a
netdev_switch_fib_ipv4_del call for fi_drop?
> @@ -1197,12 +1208,18 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
>   	new_fa->fa_state = 0;
>   	new_fa->fa_slen = slen;
>   
> +	/* (Optionally) offload fib entry to switch hardware. */
> +	err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
> +					 cfg->fc_type, tb->tb_id);

This could be an NLM_F_APPEND case. It would be better for the switchdev
API to also take nlflags as an argument, to inform the switch driver of
the replace and append cases.
> +	if (err && err != -EOPNOTSUPP)
> +		goto out_free_new_fa;
> +
>   	/* Insert new entry to the list. */
>   	if (!l) {
>   		l = fib_insert_node(t, key, plen);
>   		if (unlikely(!l)) {
>   			err = -ENOMEM;
> -			goto out_free_new_fa;
> +			goto out_sw_fib_del;
>   		}
>   	}
>   
> @@ -1217,6 +1234,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
>   succeeded:
>   	return 0;
>   
> +out_sw_fib_del:
> +	netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
>   out_free_new_fa:
>   	kmem_cache_free(fn_alias_kmem, new_fa);
>   out:
> @@ -1475,6 +1494,10 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
>   		return -ESRCH;
>   
>   	fa = fa_to_delete;
> +
> +	netdev_switch_fib_ipv4_del(key, plen, fa->fa_info, tos,
> +				   cfg->fc_type, tb->tb_id);
> +
>   	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
>   		  &cfg->fc_nlinfo, 0);
>   
> @@ -1494,7 +1517,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
>   	return 0;
>   }
>   
> -static int trie_flush_leaf(struct tnode *l)
> +static int trie_flush_leaf(struct fib_table *tb, struct tnode *l)
>   {
>   	struct hlist_node *tmp;
>   	unsigned char slen = 0;
> @@ -1505,6 +1528,10 @@ static int trie_flush_leaf(struct tnode *l)
>   		struct fib_info *fi = fa->fa_info;
>   
>   		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
> +			netdev_switch_fib_ipv4_del(l->key,
> +						   KEYLENGTH - fa->fa_slen,
> +						   fi, fa->fa_tos,
> +						   fa->fa_type, tb->tb_id);
>   			hlist_del_rcu(&fa->fa_list);
>   			fib_release_info(fa->fa_info);
>   			alias_free_mem_rcu(fa);
> @@ -1593,7 +1620,7 @@ int fib_table_flush(struct fib_table *tb)
>   	int found = 0;
>   
>   	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
> -		found += trie_flush_leaf(l);
> +		found += trie_flush_leaf(tb, l);
>   
>   		if (ll) {
>   			if (hlist_empty(&ll->leaf))
> diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
> index 8c1e558..a84bdb4 100644
> --- a/net/switchdev/switchdev.c
> +++ b/net/switchdev/switchdev.c
> @@ -14,6 +14,7 @@
>   #include <linux/mutex.h>
>   #include <linux/notifier.h>
>   #include <linux/netdevice.h>
> +#include <net/ip_fib.h>
>   #include <net/switchdev.h>
>   
>   /**
> @@ -225,3 +226,97 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
>   	return ret;
>   }
>   EXPORT_SYMBOL(ndo_dflt_netdev_switch_port_bridge_dellink);
> +
> +static struct net_device *netdev_switch_get_by_fib_dev(struct net_device *dev)
> +{
> +	const struct net_device_ops *ops = dev->netdev_ops;
> +	struct net_device *lower_dev;
> +	struct net_device *port_dev;
> +	struct list_head *iter;
> +
> +	/* Recusively search from fib_dev down until we find
> +	 * a sw port dev.  (A sw port dev supports
> +	 * ndo_switch_parent_id_get).
> +	 */
> +
> +	if (ops->ndo_switch_parent_id_get)
> +		return dev;

Maybe we can just check for NETIF_F_HW_SWITCH_OFFLOAD here, similar to
netdev_switch_port_bridge_newlink/dellink?

> +
> +	netdev_for_each_lower_dev(dev, lower_dev, iter) {
> +		port_dev = netdev_switch_get_by_fib_dev(lower_dev);
> +		if (port_dev)
> +			return port_dev;
> +	}
> +
> +	return NULL;
> +}
> +
> +/**
> + *	netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch
> + *
> + *	@dst: route's IPv4 destination address
> + *	@dst_len: destination address length (prefix length)
> + *	@fi: route FIB info structure
> + *	@tos: route TOS
> + *	@type: route type
> + *	@tb_id: route table ID
> + *
> + *	Add IPv4 route entry to switch device.
> + */
> +int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
> +			       u8 tos, u8 type, u32 tb_id)
> +{
> +	struct net_device *dev;
> +	const struct net_device_ops *ops;
> +	int err = -EOPNOTSUPP;
> +
> +	dev = netdev_switch_get_by_fib_dev(fi->fib_dev);
> +	if (!dev)
> +		return -EOPNOTSUPP;
> +	ops = dev->netdev_ops;
> +
> +	if (ops->ndo_switch_fib_ipv4_add)
> +		err = ops->ndo_switch_fib_ipv4_add(dev, htonl(dst), dst_len,
> +						   fi, tos, type, tb_id);
> +
> +	if (!err)
> +		fi->fib_flags |= RTNH_F_EXTERNAL;
> +
> +	return err;
> +}
> +EXPORT_SYMBOL(netdev_switch_fib_ipv4_add);
> +
> +/**
> + *	netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch
> + *
> + *	@dst: route's IPv4 destination address
> + *	@dst_len: destination address length (prefix length)
> + *	@fi: route FIB info structure
> + *	@tos: route TOS
> + *	@type: route type
> + *	@tb_id: route table ID
> + *
> + *	Delete IPv4 route entry from switch device.
> + */
> +int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
> +			       u8 tos, u8 type, u32 tb_id)
> +{
> +	struct net_device *dev;
> +	const struct net_device_ops *ops;
> +	int err = -EOPNOTSUPP;
> +
> +	if (!(fi->fib_flags & RTNH_F_EXTERNAL))
> +		return -EOPNOTSUPP;
> +
> +	dev = netdev_switch_get_by_fib_dev(fi->fib_dev);
> +	if (!dev)
> +		return -EOPNOTSUPP;
> +	ops = dev->netdev_ops;
> +
> +	if (ops->ndo_switch_fib_ipv4_del)
> +		err = ops->ndo_switch_fib_ipv4_del(dev, htonl(dst), dst_len,
> +						   fi, tos, type, tb_id);
> +
> +	return err;
> +}
> +EXPORT_SYMBOL(netdev_switch_fib_ipv4_del);

The rest looks great! We can extend the switchdev API as needed in the future.

Thanks, Scott.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ