[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CACzMAJKSo+8hsBF-oh=piJ2dMHT+-qntqzzdv2f190Lv0MZzjw@mail.gmail.com>
Date: Thu, 24 Jul 2014 13:54:09 -0700
From: Andy Zhou <azhou@...ira.com>
To: Tom Herbert <therbert@...gle.com>
Cc: David Miller <davem@...emloft.net>,
Linux Netdev List <netdev@...r.kernel.org>
Subject: Re: [net-next 02/10] udp: Expand UDP tunnel common APIs
On Thu, Jul 24, 2014 at 1:47 PM, Tom Herbert <therbert@...gle.com> wrote:
> On Thu, Jul 24, 2014 at 1:23 PM, Andy Zhou <azhou@...ira.com> wrote:
>> The general layering I see is tunnel_user (i.e. OVS) -> tunnel_driver
>> (i.e. vxlan) -> udp_tunnel.
>>
> Simpler and more efficient if you stick with UDP->UDP_encap_handler as
> the most general model for RX.
I believe this is the case now. I don't plan to change this. Just not
exposing the
higher layer callback to the udp_tunnel layer.
>
>> The two receive functions are from two separate layers above
>> udp_tunnel. I can restructure the APIs to make it
>> cleaner.
>>
> The only necessary function for opening the UDP encap port is the UDP
> receive handler (encap receive). If you want to implement more
> indirection within your handler then it should be pretty easy to
> create another layer of API for that purpose.
>
Yes, this is the direction I am going towards.
>> On Wed, Jul 23, 2014 at 12:57 PM, Tom Herbert <therbert@...gle.com> wrote:
>>> On Tue, Jul 22, 2014 at 3:19 AM, Andy Zhou <azhou@...ira.com> wrote:
>>>> Added create_udp_tunnel_socket(), packet receive and transmit, and
>>>> other related common functions for UDP tunnels.
>>>>
>>>> Per net open UDP tunnel ports are tracked in this common layer to
>>>> prevent sharing of a single port with more than one UDP tunnel.
>>>>
>>>> Signed-off-by: Andy Zhou <azhou@...ira.com>
>>>> ---
>>>> include/net/udp_tunnel.h | 57 +++++++++-
>>>> net/ipv4/udp_tunnel.c | 257 +++++++++++++++++++++++++++++++++++++++++++++-
>>>> 2 files changed, 312 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
>>>> index 3f34c65..b5e815a 100644
>>>> --- a/include/net/udp_tunnel.h
>>>> +++ b/include/net/udp_tunnel.h
>>>> @@ -1,7 +1,10 @@
>>>> #ifndef __NET_UDP_TUNNEL_H
>>>> #define __NET_UDP_TUNNEL_H
>>>>
>>>> -#define UDP_TUNNEL_TYPE_VXLAN 0x01
>>>> +#include <net/ip_tunnels.h>
>>>> +
>>>> +#define UDP_TUNNEL_TYPE_VXLAN 0x01
>>>> +#define UDP_TUNNEL_TYPE_GENEVE 0x02
>>>>
>>>> struct udp_port_cfg {
>>>> u8 family;
>>>> @@ -28,7 +31,59 @@ struct udp_port_cfg {
>>>> use_udp6_rx_checksums:1;
>>>> };
>>>>
>>>> +struct udp_tunnel_sock;
>>>> +
>>>> +typedef void (udp_tunnel_rcv_t)(struct udp_tunnel_sock *uts,
>>>> + struct sk_buff *skb, ...);
>>>> +
>>>> +typedef int (udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
>>>> +
>>>> +struct udp_tunnel_socket_cfg {
>>>> + u8 tunnel_type;
>>>> + struct udp_port_cfg port;
>>>> + udp_tunnel_rcv_t *rcv;
>>>> + udp_tunnel_encap_rcv_t *encap_rcv;
>>>
>>> Why do you need two receive functions or udp_tunnel_rcv_t?
>>>
>>>> + void *data;
>>>
>>> Similarly, why is this needed when we already have sk_user_data?
>>>
>>>> +};
>>>> +
>>>> +struct udp_tunnel_sock {
>>>> + u8 tunnel_type;
>>>> + struct hlist_node hlist;
>>>> + udp_tunnel_rcv_t *rcv;
>>>> + void *data;
>>>> + struct socket *sock;
>>>> +};
>>>> +
>>>> int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
>>>> struct socket **sockp);
>>>>
>>>> +struct udp_tunnel_sock *create_udp_tunnel_socket(struct net *net, size_t size,
>>>> + struct udp_tunnel_socket_cfg
>>>> + *socket_cfg);
>>>> +
>>>> +struct udp_tunnel_sock *udp_tunnel_find_sock(struct net *net, __be16 port);
>>>> +
>>>> +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
>>>> + struct sk_buff *skb, __be32 src, __be32 dst,
>>>> + __u8 tos, __u8 ttl, __be16 df, __be16 src_port,
>>>> + __be16 dst_port, bool xnet);
>>>> +
>>>> +#if IS_ENABLED(CONFIG_IPV6)
>>>> +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst,
>>>> + struct sk_buff *skb, struct net_device *dev,
>>>> + struct in6_addr *saddr, struct in6_addr *daddr,
>>>> + __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port);
>>>> +
>>>> +#endif
>>>> +
>>>> +void udp_tunnel_sock_release(struct udp_tunnel_sock *uts);
>>>> +void udp_tunnel_get_rx_port(struct net_device *dev);
>>>> +
>>>> +static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
>>>> + bool udp_csum)
>>>> +{
>>>> + int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
>>>> +
>>>> + return iptunnel_handle_offloads(skb, udp_csum, type);
>>>> +}
>>>> #endif
>>>> diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
>>>> index 61ec1a6..3c14b16 100644
>>>> --- a/net/ipv4/udp_tunnel.c
>>>> +++ b/net/ipv4/udp_tunnel.c
>>>> @@ -7,6 +7,23 @@
>>>> #include <net/udp.h>
>>>> #include <net/udp_tunnel.h>
>>>> #include <net/net_namespace.h>
>>>> +#include <net/netns/generic.h>
>>>> +#if IS_ENABLED(CONFIG_IPV6)
>>>> +#include <net/ipv6.h>
>>>> +#include <net/addrconf.h>
>>>> +#include <net/ip6_tunnel.h>
>>>> +#include <net/ip6_checksum.h>
>>>> +#endif
>>>> +
>>>> +#define PORT_HASH_BITS 8
>>>> +#define PORT_HASH_SIZE (1 << PORT_HASH_BITS)
>>>> +
>>>> +static int udp_tunnel_net_id;
>>>> +
>>>> +struct udp_tunnel_net {
>>>> + struct hlist_head sock_list[PORT_HASH_SIZE];
>>>> + spinlock_t sock_lock; /* Protecting the sock_list */
>>>> +};
>>>>
>>>> int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
>>>> struct socket **sockp)
>>>> @@ -82,7 +99,6 @@ int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
>>>> return -EPFNOSUPPORT;
>>>> }
>>>>
>>>> -
>>>> *sockp = sock;
>>>>
>>>> return 0;
>>>> @@ -97,4 +113,243 @@ error:
>>>> }
>>>> EXPORT_SYMBOL(udp_sock_create);
>>>>
>>>> +
>>>> +/* Socket hash table head */
>>>> +static inline struct hlist_head *uts_head(struct net *net, const __be16 port)
>>>> +{
>>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id);
>>>> +
>>>> + return &utn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
>>>> +}
>>>> +
>>>> +static int handle_offloads(struct sk_buff *skb)
>>>> +{
>>>> + if (skb_is_gso(skb)) {
>>>> + int err = skb_unclone(skb, GFP_ATOMIC);
>>>> +
>>>> + if (unlikely(err))
>>>> + return err;
>>>> + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
>>>> + } else {
>>>> + if (skb->ip_summed != CHECKSUM_PARTIAL)
>>>> + skb->ip_summed = CHECKSUM_NONE;
>>>> + }
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +struct udp_tunnel_sock *create_udp_tunnel_socket(struct net *net, size_t size,
>>>> + struct udp_tunnel_socket_cfg
>>>> + *cfg)
>>>> +{
>>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id);
>>>> + struct udp_tunnel_sock *uts;
>>>> + struct socket *sock;
>>>> + struct sock *sk;
>>>> + const __be16 port = cfg->port.local_udp_port;
>>>> + const int ipv6 = (cfg->port.family == AF_INET6);
>>>> + int err;
>>>> +
>>>> + uts = kzalloc(size, GFP_KERNEL);
>>>> + if (!uts)
>>>> + return ERR_PTR(-ENOMEM);
>>>> +
>>>> + err = udp_sock_create(net, &cfg->port, &sock);
>>>> + if (err < 0) {
>>>> + kfree(uts);
>>>> + return NULL;
>>>> + }
>>>> +
>>>> + /* Disable multicast loopback */
>>>> + inet_sk(sock->sk)->mc_loop = 0;
>>>> +
>>>> + uts->sock = sock;
>>>> + sk = sock->sk;
>>>> + uts->rcv = cfg->rcv;
>>>> + uts->data = cfg->data;
>>>> + rcu_assign_sk_user_data(sock->sk, uts);
>>>> +
>>>> + spin_lock(&utn->sock_lock);
>>>> + hlist_add_head_rcu(&uts->hlist, uts_head(net, port));
>>>> + spin_unlock(&utn->sock_lock);
>>>> +
>>>> + udp_sk(sk)->encap_type = 1;
>>>> + udp_sk(sk)->encap_rcv = cfg->encap_rcv;
>>>> +
>>>> +#if IS_ENABLED(CONFIG_IPV6)
>>>> + if (ipv6)
>>>> + ipv6_stub->udpv6_encap_enable();
>>>> + else
>>>> +#endif
>>>> + udp_encap_enable();
>>>> +
>>>> + return uts;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(create_udp_tunnel_socket);
>>>> +
>>>> +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
>>>> + struct sk_buff *skb, __be32 src, __be32 dst,
>>>> + __u8 tos, __u8 ttl, __be16 df, __be16 src_port,
>>>> + __be16 dst_port, bool xnet)
>>>> +{
>>>> + struct udphdr *uh;
>>>> +
>>>> + __skb_push(skb, sizeof(*uh));
>>>> + skb_reset_transport_header(skb);
>>>> + uh = udp_hdr(skb);
>>>> +
>>>> + uh->dest = dst_port;
>>>> + uh->source = src_port;
>>>> + uh->len = htons(skb->len);
>>>> +
>>>> + udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len);
>>>> +
>>>> + return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP,
>>>> + tos, ttl, df, xnet);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
>>>> +
>>>> +#if IS_ENABLED(CONFIG_IPV6)
>>>> +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst,
>>>> + struct sk_buff *skb, struct net_device *dev,
>>>> + struct in6_addr *saddr, struct in6_addr *daddr,
>>>> + __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port)
>>>> +{
>>>> + struct udphdr *uh;
>>>> + struct ipv6hdr *ip6h;
>>>> + int err;
>>>> +
>>>> + __skb_push(skb, sizeof(*uh));
>>>> + skb_reset_transport_header(skb);
>>>> + uh = udp_hdr(skb);
>>>> +
>>>> + uh->dest = dst_port;
>>>> + uh->source = src_port;
>>>> +
>>>> + uh->len = htons(skb->len);
>>>> + uh->check = 0;
>>>> +
>>>> + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
>>>> + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
>>>> + | IPSKB_REROUTED);
>>>> + skb_dst_set(skb, dst);
>>>> +
>>>> + if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) {
>>>> + __wsum csum = skb_checksum(skb, 0, skb->len, 0);
>>>> +
>>>> + skb->ip_summed = CHECKSUM_UNNECESSARY;
>>>> + uh->check = csum_ipv6_magic(saddr, daddr, skb->len,
>>>> + IPPROTO_UDP, csum);
>>>> + if (uh->check == 0)
>>>> + uh->check = CSUM_MANGLED_0;
>>>> + } else {
>>>> + skb->ip_summed = CHECKSUM_PARTIAL;
>>>> + skb->csum_start = skb_transport_header(skb) - skb->head;
>>>> + skb->csum_offset = offsetof(struct udphdr, check);
>>>> + uh->check = ~csum_ipv6_magic(saddr, daddr,
>>>> + skb->len, IPPROTO_UDP, 0);
>>>> + }
>>>> +
>>>> + __skb_push(skb, sizeof(*ip6h));
>>>> + skb_reset_network_header(skb);
>>>> + ip6h = ipv6_hdr(skb);
>>>> + ip6h->version = 6;
>>>> + ip6h->priority = prio;
>>>> + ip6h->flow_lbl[0] = 0;
>>>> + ip6h->flow_lbl[1] = 0;
>>>> + ip6h->flow_lbl[2] = 0;
>>>> + ip6h->payload_len = htons(skb->len);
>>>> + ip6h->nexthdr = IPPROTO_UDP;
>>>> + ip6h->hop_limit = ttl;
>>>> + ip6h->daddr = *daddr;
>>>> + ip6h->saddr = *saddr;
>>>> +
>>>> + err = handle_offloads(skb);
>>>> + if (err)
>>>> + return err;
>>>> +
>>>> + ip6tunnel_xmit(skb, dev);
>>>> + return 0;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
>>>> +#endif
>>>> +
>>>> +struct udp_tunnel_sock *udp_tunnel_find_sock(struct net *net, __be16 port)
>>>> +{
>>>> + struct udp_tunnel_sock *uts;
>>>> +
>>>> + hlist_for_each_entry_rcu(uts, uts_head(net, port), hlist) {
>>>> + if (inet_sk(uts->sock->sk)->inet_sport == port)
>>>> + return uts;
>>>> + }
>>>> +
>>>> + return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(udp_tunnel_find_sock);
>>>> +
>>>> +void udp_tunnel_sock_release(struct udp_tunnel_sock *uts)
>>>> +{
>>>> + struct sock *sk = uts->sock->sk;
>>>> + struct net *net = sock_net(sk);
>>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id);
>>>> +
>>>> + spin_lock(&utn->sock_lock);
>>>> + hlist_del_rcu(&uts->hlist);
>>>> + rcu_assign_sk_user_data(uts->sock->sk, NULL);
>>>> + spin_unlock(&utn->sock_lock);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
>>>> +
>>>> +/* Calls the ndo_add_udp_tunnel_port of the caller in order to
>>>> + * supply the listening VXLAN udp ports. Callers are expected
>>>> + * to implement the ndo_add_udp_tunnel_port.
>>>> + */
>>>> +void udp_tunnel_get_rx_port(struct net_device *dev)
>>>> +{
>>>> + struct udp_tunnel_sock *uts;
>>>> + struct net *net = dev_net(dev);
>>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id);
>>>> + sa_family_t sa_family;
>>>> + __be16 port;
>>>> + unsigned int i;
>>>> +
>>>> + spin_lock(&utn->sock_lock);
>>>> + for (i = 0; i < PORT_HASH_SIZE; ++i) {
>>>> + hlist_for_each_entry_rcu(uts, &utn->sock_list[i], hlist) {
>>>> + port = inet_sk(uts->sock->sk)->inet_sport;
>>>> + sa_family = uts->sock->sk->sk_family;
>>>> + dev->netdev_ops->ndo_add_udp_tunnel_port(dev,
>>>> + sa_family, port, uts->tunnel_type);
>>>> + }
>>>> + }
>>>> + spin_unlock(&utn->sock_lock);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(udp_tunnel_get_rx_port);
>>>> +
>>>> +static int __net_init udp_tunnel_init_net(struct net *net)
>>>> +{
>>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id);
>>>> + unsigned int h;
>>>> +
>>>> + spin_lock_init(&utn->sock_lock);
>>>> +
>>>> + for (h = 0; h < PORT_HASH_SIZE; h++)
>>>> + INIT_HLIST_HEAD(&utn->sock_list[h]);
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static struct pernet_operations udp_tunnel_net_ops = {
>>>> + .init = udp_tunnel_init_net,
>>>> + .exit = NULL,
>>>> + .id = &udp_tunnel_net_id,
>>>> + .size = sizeof(struct udp_tunnel_net),
>>>> +};
>>>> +
>>>> +static int __init udp_tunnel_init(void)
>>>> +{
>>>> + return register_pernet_subsys(&udp_tunnel_net_ops);
>>>> +}
>>>> +late_initcall(udp_tunnel_init);
>>>> +
>>>> MODULE_LICENSE("GPL");
>>>> --
>>>> 1.7.9.5
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>>> the body of a message to majordomo@...r.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists