[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200416132242.GA2586@white>
Date: Thu, 16 Apr 2020 16:22:42 +0300
From: Leşe Doru Călin <lesedorucalin01@...il.com>
To: Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org
Cc: David Miller <davem@...emloft.net>,
Alexey Kuznetsov <kuznet@....inr.ac.ru>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>
Subject: [PATCH v4] net: UDP repair mode for retrieving the send queue of
corked UDP socket
In this year's edition of GSoC, there is a project idea for CRIU to add
support for checkpoint/restore of corked UDP sockets. To add it, the
kernel API needs to be extended.
This patch does that. It adds a UDP "repair mode" for UDP sockets, similar
to the TCP "repair mode", except that only the send queue needs to be
retrieved. To that end, the patch extends the recv and setsockopt syscalls.
Using the UDP_REPAIR option of setsockopt, the caller can put the socket
into repair mode. While it is set, recv/recvfrom/recvmsg return the
write queue and the destination of the data. As in the TCP mode, changing
the repair mode requires the CAP_NET_ADMIN capability, and to receive data
the caller must use the MSG_PEEK flag.
Signed-off-by: Lese Doru Calin <lesedorucalin01@...il.com>
---
include/linux/udp.h | 3 +
include/net/udp.h | 3 +
include/uapi/linux/udp.h | 1
net/ipv4/udp.c | 85 +++++++++++++++++++++++++++++++++++++++--------
net/ipv6/udp.c | 64 ++++++++++++++++++++++++-----------
5 files changed, 122 insertions(+), 34 deletions(-)
diff --git a/include/linux/udp.h b/include/linux/udp.h
index aa84597bdc33..b22bd70118ce 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -51,7 +51,8 @@ struct udp_sock {
* different encapsulation layer set
* this
*/
- gro_enabled:1; /* Can accept GRO packets */
+ gro_enabled:1, /* Can accept GRO packets */
+ repair:1;/* Receive the send queue */
/*
* Following member retains the information to create a UDP header
* when the socket is uncorked.
diff --git a/include/net/udp.h b/include/net/udp.h
index a8fa6c0c6ded..f7a7fab0712f 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -282,8 +282,11 @@ int udp_get_port(struct sock *sk, unsigned short snum,
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
+int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 4828794efcf8..2fe78329d6da 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -29,6 +29,7 @@ struct udphdr {
/* UDP socket options */
#define UDP_CORK 1 /* Never send partially complete segments */
+#define UDP_REPAIR 19 /* Receive the send queue */
#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32564b350823..58b59db42ca3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1720,6 +1720,48 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
}
EXPORT_SYMBOL(__skb_recv_udp);
+int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len)
+{
+ int copy, copied = 0, err = 0;
+ struct sk_buff *skb;
+
+ lock_sock(sk);
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ copy = len - copied;
+ if (copy > skb->len - off)
+ copy = skb->len - off;
+
+ err = skb_copy_datagram_msg(skb, off, msg, copy);
+ if (err)
+ break;
+
+ copied += copy;
+ if (len <= copied)
+ break;
+ }
+ release_sock(sk);
+ return err ?: copied;
+}
+EXPORT_SYMBOL(udp_peek_sndq);
+
+static void udp_set_source_addr(struct sock *sk, struct msghdr *msg,
+ int *addr_len, u32 addr, u16 port)
+{
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = port;
+ sin->sin_addr.s_addr = addr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin);
+ }
+}
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -1729,8 +1771,9 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
- DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct udp_sock *up = udp_sk(sk);
struct sk_buff *skb;
+ struct flowi4 *fl4;
unsigned int ulen, copied;
int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
@@ -1739,6 +1782,17 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len);
+ if (unlikely(up->repair)) {
+ if (!peeking)
+ return -EPERM;
+
+ off = sizeof(struct iphdr) + sizeof(struct udphdr);
+ fl4 = &inet->cork.fl.u.ip4;
+ udp_set_source_addr(sk, msg, addr_len, fl4->daddr,
+ fl4->fl4_dport);
+ return udp_peek_sndq(sk, msg, off, len);
+ }
+
try_again:
off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
@@ -1793,19 +1847,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
UDP_MIB_INDATAGRAMS, is_udplite);
sock_recv_ts_and_drops(msg, sk, skb);
-
- /* Copy the address. */
- if (sin) {
- sin->sin_family = AF_INET;
- sin->sin_port = udp_hdr(skb)->source;
- sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- *addr_len = sizeof(*sin);
-
- if (cgroup_bpf_enabled)
- BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
- (struct sockaddr *)sin);
- }
+ udp_set_source_addr(sk, msg, addr_len, ip_hdr(skb)->saddr,
+ udp_hdr(skb)->source);
if (udp_sk(sk)->gro_enabled)
udp_cmsg_recv(msg, sk, skb);
@@ -1833,6 +1876,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
msg->msg_flags &= ~MSG_TRUNC;
goto try_again;
}
+EXPORT_SYMBOL(udp_recvmsg);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -2557,6 +2601,15 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
}
break;
+ case UDP_REPAIR:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ err = -EPERM;
+ else if (val != 0)
+ up->repair = 1;
+ else
+ up->repair = 0;
+ break;
+
case UDP_ENCAP:
switch (val) {
case 0:
@@ -2678,6 +2731,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->corkflag;
break;
+ case UDP_REPAIR:
+ val = up->repair;
+ break;
+
case UDP_ENCAP:
val = up->encap_type;
break;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7d4151747340..be2a668c29c5 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -250,6 +250,24 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
+static void udpv6_set_source_addr(struct sock *sk, struct msghdr *msg,
+ int *addr_len, struct in6_addr *addr,
+ u16 port, u32 scope_id)
+{
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = port;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_addr = *addr;
+ sin6->sin6_scope_id = scope_id;
+ *addr_len = sizeof(*sin6);
+
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin6);
+}
+
/* do not use the scratch area len for jumbogram: their length execeeds the
* scratch area space; note that the IP6CB flags is still in the first
* cacheline, so checking for jumbograms is cheap
@@ -269,8 +287,11 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct inet_sock *inet = inet_sk(sk);
+ struct udp_sock *up = udp_sk(sk);
+ struct in6_addr saddr;
struct sk_buff *skb;
- unsigned int ulen, copied;
+ struct flowi6 *fl6;
+ unsigned int ulen, scpid, copied;
int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
struct udp_mib __percpu *mib;
@@ -283,6 +304,23 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (np->rxpmtu && np->rxopt.bits.rxpmtu)
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
+ if (unlikely(up->repair)) {
+ if (!peeking)
+ return -EPERM;
+
+ if (up->pending == AF_INET)
+ return udp_recvmsg(sk, msg, len, noblock,
+ flags, addr_len);
+
+ off = sizeof(struct ipv6hdr) + sizeof(struct udphdr);
+ if (msg->msg_name) {
+ fl6 = &inet->cork.fl.u.ip6;
+ udpv6_set_source_addr(sk, msg, addr_len, &fl6->daddr,
+ fl6->fl6_dport, fl6->flowi6_oif);
+ }
+ return udp_peek_sndq(sk, msg, off, len);
+ }
+
try_again:
off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
@@ -336,28 +374,16 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
sock_recv_ts_and_drops(msg, sk, skb);
- /* Copy the address. */
if (msg->msg_name) {
- DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = udp_hdr(skb)->source;
- sin6->sin6_flowinfo = 0;
-
if (is_udp4) {
- ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
- &sin6->sin6_addr);
- sin6->sin6_scope_id = 0;
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &saddr);
+ scpid = 0;
} else {
- sin6->sin6_addr = ipv6_hdr(skb)->saddr;
- sin6->sin6_scope_id =
- ipv6_iface_scope_id(&sin6->sin6_addr,
- inet6_iif(skb));
+ saddr = ipv6_hdr(skb)->saddr;
+ scpid = ipv6_iface_scope_id(&saddr, inet6_iif(skb));
}
- *addr_len = sizeof(*sin6);
-
- if (cgroup_bpf_enabled)
- BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
- (struct sockaddr *)sin6);
+ udpv6_set_source_addr(sk, msg, addr_len, &saddr,
+ udp_hdr(skb)->source, scpid);
}
if (udp_sk(sk)->gro_enabled)
Powered by blists - more mailing lists