[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20200414222749.GA22558@white>
Date: Wed, 15 Apr 2020 01:27:49 +0300
From: Leşe Doru Călin <lesedorucalin01@...il.com>
To: Eric Dumazet <eric.dumazet@...il.com>, netdev@...r.kernel.org
Cc: David Miller <davem@...emloft.net>,
Alexey Kuznetsov <kuznet@....inr.ac.ru>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
Leon Romanovsky <leon@...nel.org>
Subject: Re: [PATCH v3] net: UDP repair mode for retrieving the send queue of
corked UDP socket
On Tue, Apr 14, 2020 at 11:43:59AM -0700, Eric Dumazet wrote:
>
>
> On 4/14/20 2:09 AM, Lese Doru Calin wrote:
> > In this year's edition of GSoC, there is a project idea for CRIU to add
> > support for checkpoint/restore of cork-ed UDP sockets. But to add it, the
> > kernel API needs to be extended.
> >
> > This is what this patch does. It adds UDP "repair mode" for UDP sockets in
> > a similar approach to the TCP "repair mode", but only the send queue is
> > necessary to be retrieved. So the patch extends the recv and setsockopt
> > syscalls. Using UDP_REPAIR option in setsockopt, caller can set the socket
> > in repair mode. If it is setted, the recv/recvfrom/recvmsg will receive the
> > write queue and the destination of the data. As in the TCP mode, to change
> > the repair mode requires the CAP_NET_ADMIN capability and to receive data
> > the caller is obliged to use the MSG_PEEK flag.
> >
> > Signed-off-by: Lese Doru Calin <lesedorucalin01@...il.com>
> > ---
> > include/linux/udp.h | 3 ++-
> > include/uapi/linux/udp.h | 1 +
> > net/ipv4/udp.c | 55 ++++++++++++++++++++++++++++++++++++++++
> > net/ipv6/udp.c | 45 ++++++++++++++++++++++++++++++++
> > 4 files changed, 103 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/udp.h b/include/linux/udp.h
> > index aa84597bdc33..b22bd70118ce 100644
> > --- a/include/linux/udp.h
> > +++ b/include/linux/udp.h
> > @@ -51,7 +51,8 @@ struct udp_sock {
> > * different encapsulation layer set
> > * this
> > */
> > - gro_enabled:1; /* Can accept GRO packets */
> > + gro_enabled:1, /* Can accept GRO packets */
> > + repair:1;/* Receive the send queue */
> > /*
> > * Following member retains the information to create a UDP header
> > * when the socket is uncorked.
> > diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
> > index 4828794efcf8..2fe78329d6da 100644
> > --- a/include/uapi/linux/udp.h
> > +++ b/include/uapi/linux/udp.h
> > @@ -29,6 +29,7 @@ struct udphdr {
> >
> > /* UDP socket options */
> > #define UDP_CORK 1 /* Never send partially complete segments */
> > +#define UDP_REPAIR 19 /* Receive the send queue */
> > #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
> > #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
> > #define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index 32564b350823..dc9d15d564d6 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -1720,6 +1720,28 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
> > }
> > EXPORT_SYMBOL(__skb_recv_udp);
> >
> > +static int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len)
> > +{
> > + int copy, copied = 0, err = 0;
> > + struct sk_buff *skb;
> > +
> > + skb_queue_walk(&sk->sk_write_queue, skb) {
> > + copy = len - copied;
> > + if (copy > skb->len - off)
> > + copy = skb->len - off;
> > +
> > + err = skb_copy_datagram_msg(skb, off, msg, copy);
> > + if (err)
> > + break;
> > +
> > + copied += copy;
> > +
> > + if (len <= copied)
> > + break;
> > + }
> > + return err ?: copied;
> > +}
> > +
> > /*
> > * This should be easy, if there is something there we
> > * return it, otherwise we block.
> > @@ -1729,8 +1751,10 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
> > int flags, int *addr_len)
> > {
> > struct inet_sock *inet = inet_sk(sk);
> > + struct udp_sock *up = udp_sk(sk);
> > DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
> > struct sk_buff *skb;
> > + struct flowi4 *fl4;
> > unsigned int ulen, copied;
> > int off, err, peeking = flags & MSG_PEEK;
> > int is_udplite = IS_UDPLITE(sk);
> > @@ -1739,6 +1763,12 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
> > if (flags & MSG_ERRQUEUE)
> > return ip_recv_error(sk, msg, len, addr_len);
> >
> > + if (unlikely(up->repair)) {
>
> Socket lock is not held, disaster seems possible :/
>
I missed that. Sorry.
> > + if (!peeking)
> > + return -EPERM;
> > + goto recv_sndq;
> > + }
> > +
>
>
> Really, I would rather stop adding code in fast path for REPAIR stuff.
>
> A new setsockopt() would be much better I think.
>
I could implement it as a option for getsockopt syscall, with a pointer to a
buffer or a user_msghdr as a parameter, so that other syscalls are not affected.
> > try_again:
> > off = sk_peek_offset(sk, flags);
> > skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
> > @@ -1832,6 +1862,18 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
> > cond_resched();
> > msg->msg_flags &= ~MSG_TRUNC;
> > goto try_again;
> > +
> > +recv_sndq:
> > + off = sizeof(struct iphdr) + sizeof(struct udphdr);
> > + if (sin) {
> > + fl4 = &inet->cork.fl.u.ip4;
> > + sin->sin_family = AF_INET;
> > + sin->sin_port = fl4->fl4_dport;
> > + sin->sin_addr.s_addr = fl4->daddr;
> > + memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
> > + *addr_len = sizeof(*sin);
> > + }
> > + return udp_peek_sndq(sk, msg, off, len);
> > }
> >
> > int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
> > @@ -2557,6 +2599,15 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
> > }
> > break;
> >
> > + case UDP_REPAIR:
> > + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
> > + err = -EPERM;
> > + else if (val != 0)
> > + up->repair = 1;
> > + else
> > + up->repair = 0;
> > + break;
> > +
> > case UDP_ENCAP:
> > switch (val) {
> > case 0:
> > @@ -2678,6 +2729,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
> > val = up->corkflag;
> > break;
> >
> > + case UDP_REPAIR:
> > + val = up->repair;
> > + break;
> > +
> > case UDP_ENCAP:
> > val = up->encap_type;
> > break;
> > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> > index 7d4151747340..ec653f9fce2d 100644
> > --- a/net/ipv6/udp.c
> > +++ b/net/ipv6/udp.c
> > @@ -250,6 +250,28 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
> > EXPORT_SYMBOL_GPL(udp6_lib_lookup);
> > #endif
> >
> > +static int udp6_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len)
> > +{
> > + int copy, copied = 0, err = 0;
> > + struct sk_buff *skb;
> > +
> > + skb_queue_walk(&sk->sk_write_queue, skb) {
> > + copy = len - copied;
> > + if (copy > skb->len - off)
> > + copy = skb->len - off;
> > +
> > + err = skb_copy_datagram_msg(skb, off, msg, copy);
> > + if (err)
> > + break;
> > +
> > + copied += copy;
> > +
> > + if (len <= copied)
> > + break;
> > + }
> > + return err ?: copied;
> > +}
> > +
> > /* do not use the scratch area len for jumbogram: their length execeeds the
> > * scratch area space; note that the IP6CB flags is still in the first
> > * cacheline, so checking for jumbograms is cheap
> > @@ -269,7 +291,9 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> > {
> > struct ipv6_pinfo *np = inet6_sk(sk);
> > struct inet_sock *inet = inet_sk(sk);
> > + struct udp_sock *up = udp_sk(sk);
> > struct sk_buff *skb;
> > + struct flowi6 *fl6;
> > unsigned int ulen, copied;
> > int off, err, peeking = flags & MSG_PEEK;
> > int is_udplite = IS_UDPLITE(sk);
> > @@ -283,6 +307,12 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> > if (np->rxpmtu && np->rxopt.bits.rxpmtu)
> > return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
> >
> > + if (unlikely(up->repair)) {
> > + if (!peeking)
> > + return -EPERM;
> > + goto recv_sndq;
> > + }
> > +
> > try_again:
> > off = sk_peek_offset(sk, flags);
> > skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
> > @@ -394,6 +424,21 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> > cond_resched();
> > msg->msg_flags &= ~MSG_TRUNC;
> > goto try_again;
> > +
> > +recv_sndq:
> > + off = sizeof(struct ipv6hdr) + sizeof(struct udphdr);
> > + if (msg->msg_name) {
> > + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
> > +
> > + fl6 = &inet->cork.fl.u.ip6;
> > + sin6->sin6_family = AF_INET6;
> > + sin6->sin6_port = fl6->fl6_dport;
> > + sin6->sin6_flowinfo = 0;
> > + sin6->sin6_addr = fl6->daddr;
> > + sin6->sin6_scope_id = fl6->flowi6_oif;
> > + *addr_len = sizeof(*sin6);
> > + }
> > + return udp6_peek_sndq(sk, msg, off, len);
> > }
> >
> > DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
> >
Powered by blists - more mailing lists