[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YqarphOzFTnQRq29@d3>
Date: Mon, 13 Jun 2022 12:14:46 +0900
From: Benjamin Poirier <bpoirier@...dia.com>
To: Mike Manning <mvrmanning@...il.com>
Cc: Netdev <netdev@...r.kernel.org>, David Ahern <dsahern@...il.com>,
Saikrishna Arcot <sarcot@...rosoft.com>,
Craig Gallek <kraig@...gle.com>
Subject: Re: [PATCH] net: prefer socket bound to interface when not in VRF
On 2021-10-05 14:03 +0100, Mike Manning wrote:
[...]
>
> Fixes: 6da5b0f027a8 ("net: ensure unbound datagram socket to be chosen when not in a VRF")
> Fixes: e78190581aff ("net: ensure unbound stream socket to be chosen when not in a VRF")
> Signed-off-by: Mike Manning <mmanning@...tta.att-mail.com>
> ---
>
> diff nettest-baseline-9e9fb7655ed5.txt nettest-fix.txt
> 955,956c955,956
> < TEST: IPv4 TCP connection over VRF with SNAT [FAIL]
> < TEST: IPv6 TCP connection over VRF with SNAT [FAIL]
> ---
> > TEST: IPv4 TCP connection over VRF with SNAT [ OK ]
> > TEST: IPv6 TCP connection over VRF with SNAT [ OK ]
> 958,959c958,959
> < Tests passed: 713
> < Tests failed: 5
> ---
> > Tests passed: 715
> > Tests failed: 3
>
> ---
> net/ipv4/inet_hashtables.c | 4 +++-
> net/ipv4/udp.c | 3 ++-
> net/ipv6/inet6_hashtables.c | 2 +-
> net/ipv6/udp.c | 3 ++-
> 4 files changed, 8 insertions(+), 4 deletions(-)
>
Hi Mike,
I was looking at this commit, 8d6c414cd2fb ("net: prefer socket bound to
interface when not in VRF"), and I get the feeling that it is only
partially effective. It works with UDP connected sockets but it doesn't
work for TCP and UDP unconnected sockets.
The compute_score() functions are a bit misleading. Because of the
reuseport shortcut in their callers (inet_lhash2_lookup() and the like),
the first socket with score > 0 may be chosen, not necessarily the
socket with highest score. In order to prefer certain sockets, I think
an approach like commit d894ba18d4e4 ("soreuseport: fix ordering for
mixed v4/v6 sockets") would be needed. What do you think?
Extra info:
1) fcnal-test.sh results
I tried to reproduce the fcnal-test.sh test results quoted above but in
my case the test cases already pass at 8d6c414cd2fb^ and 9e9fb7655ed5.
Moreover I believe those test cases don't have multiple listening
sockets. So that just added to my confusion.
Running 9e9fb7655ed5,
root@...d:/src/linux/tools/testing/selftests/net# ./fcnal-test.sh -t use_cases
[...]
#################################################################
SNAT on VRF
TEST: IPv4 TCP connection over VRF with SNAT [ OK ]
TEST: IPv6 TCP connection over VRF with SNAT [ OK ]
Tests passed: 16
Tests failed: 0
2) reuseport_bindtodevice test
I wrote a selftest based on
tools/testing/selftests/net/reuseport_addr_any.c. It tests that listening
sockets that have SO_BINDTODEVICE set are preferred over ones that do
not. All of the sockets have SO_REUSEPORT set. I ran it over a few
relevant revisions:
                |            IPv4              |            IPv6
HEAD            | TCP  UDP unconn  UDP conn    | TCP  UDP unconn  UDP conn
6a5ef90c58da^   | ✔    ✔           ✔           | ✔    ✔           ✔
6a5ef90c58da    | ✔    ✘           ✔           | ✔    ✘           ✔
fd1914b2901b    | ✘    ✘           ✔           | ✘    ✘           ✔
7e225619e8af    | ✘    ✘           ✘           | ✘    ✘           ✘
8d6c414cd2fb    | ✘    ✘           ✔           | ✘    ✘           ✔
✔ pass
✘ fail
reuseport_bindtodevice.c:
// SPDX-License-Identifier: GPL-2.0
/* Test that listening sockets that have SO_BINDTODEVICE set are preferred
* over ones that do not. All of the sockets have SO_REUSEPORT set.
*/
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <errno.h>
#include <error.h>
#include <linux/in.h>
#include <linux/unistd.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>
/* Port the sending socket binds to; connected receive sockets use it as
 * their peer port.
 */
static const int SEND_PORT = 8888;
/* Port every receive socket binds to and the sending socket connects to. */
static const int RECV_PORT = 8889;
/* Map an address family constant to a printable name.
 *
 * Returns "IPv4" for AF_INET and "IPv6" for AF_INET6. Any other value is
 * reported through error() with exit status 1; the trailing empty string
 * only exists so that every path has a return value.
 */
static const char *get_family_name(int domain)
{
	switch (domain) {
	case AF_INET:
		return "IPv4";
	case AF_INET6:
		return "IPv6";
	default:
		break;
	}

	error(1, 0, "Unknown address family \"%d\"", domain);
	return "";
}
/* Create @count receive sockets of family @domain and socket type @type and
 * store their descriptors in @rcv_fds[0] .. @rcv_fds[@count - 1].
 *
 * Every socket gets SO_REUSEPORT and is bound to the wildcard address on
 * RECV_PORT. When @ifname is non-NULL, SO_BINDTODEVICE is set to that name
 * before bind(). When @do_connect is true the socket is additionally
 * connected to the wildcard address on SEND_PORT. SOCK_STREAM sockets are
 * put into the listening state.
 *
 * Any failure terminates the program through error() with exit status 1.
 */
static void build_rcv_fd(int domain, int type, int *rcv_fds, int count,
			 const char *ifname, bool do_connect)
{
	/* Zero both address buffers: bind()/connect() are handed the full
	 * sizeof(struct sockaddr_storage), and fields such as sin6_flowinfo
	 * and sin6_scope_id are never assigned below.
	 */
	struct sockaddr_storage saddr = {0}, daddr = {0};
	int opt, i;

	if (domain == AF_INET) {
		struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr,
				   *daddr4 = (struct sockaddr_in *)&daddr;

		saddr4->sin_family = AF_INET;
		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		saddr4->sin_port = htons(RECV_PORT);

		daddr4->sin_family = AF_INET;
		daddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		daddr4->sin_port = htons(SEND_PORT);
	} else if (domain == AF_INET6) {
		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)&saddr,
				    *daddr6 = (struct sockaddr_in6 *)&daddr;

		saddr6->sin6_family = AF_INET6;
		saddr6->sin6_addr = in6addr_any;
		saddr6->sin6_port = htons(RECV_PORT);

		daddr6->sin6_family = AF_INET6;
		daddr6->sin6_addr = in6addr_any;
		daddr6->sin6_port = htons(SEND_PORT);
	} else {
		error(1, 0, "Unsupported family %d", domain);
	}

	for (i = 0; i < count; ++i) {
		rcv_fds[i] = socket(domain, type, 0);
		if (rcv_fds[i] < 0)
			error(1, errno, "failed to create receive socket");

		opt = 1;
		if (setsockopt(rcv_fds[i], SOL_SOCKET, SO_REUSEPORT, &opt,
			       sizeof(opt)))
			error(1, errno, "failed to set SO_REUSEPORT");

		/* The device binding is applied before bind(). */
		if (ifname && setsockopt(rcv_fds[i], SOL_SOCKET,
					 SO_BINDTODEVICE, ifname,
					 strlen(ifname)))
			error(1, errno, "failed to set SO_BINDTODEVICE");

		if (bind(rcv_fds[i], (struct sockaddr *)&saddr, sizeof(saddr)))
			error(1, errno, "failed to bind receive socket");

		if (do_connect &&
		    connect(rcv_fds[i], (struct sockaddr *)&daddr,
			    sizeof(daddr)))
			error(1, errno, "failed to connect receive socket");

		if (type == SOCK_STREAM && listen(rcv_fds[i], 10))
			error(1, errno, "failed to listen on receive socket");
	}
}
/* Create the sending side of one test round.
 *
 * Opens a socket of family @domain and socket type @type, binds it to the
 * wildcard address on SEND_PORT, connects it to the loopback address on
 * RECV_PORT and sends the single byte "a".
 *
 * Returns the socket descriptor; the caller is expected to close it. Any
 * failure terminates the program through error() with exit status 1.
 */
static int connect_and_send(int domain, int type)
{
	/* Zero both address buffers: bind()/connect() are handed the full
	 * sizeof(struct sockaddr_storage), and fields such as sin6_flowinfo
	 * and sin6_scope_id are never assigned below.
	 */
	struct sockaddr_storage saddr = {0}, daddr = {0};
	int fd;

	if (domain == AF_INET) {
		struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr,
				   *daddr4 = (struct sockaddr_in *)&daddr;

		saddr4->sin_family = AF_INET;
		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		saddr4->sin_port = htons(SEND_PORT);

		daddr4->sin_family = AF_INET;
		daddr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		daddr4->sin_port = htons(RECV_PORT);
	} else if (domain == AF_INET6) {
		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)&saddr,
				    *daddr6 = (struct sockaddr_in6 *)&daddr;

		saddr6->sin6_family = AF_INET6;
		saddr6->sin6_addr = in6addr_any;
		saddr6->sin6_port = htons(SEND_PORT);

		daddr6->sin6_family = AF_INET6;
		daddr6->sin6_addr = in6addr_loopback;
		daddr6->sin6_port = htons(RECV_PORT);
	} else {
		error(1, 0, "Unsupported family %d", domain);
	}

	fd = socket(domain, type, 0);
	if (fd < 0)
		error(1, errno, "failed to create send socket");

	if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)))
		error(1, errno, "failed to bind send socket");

	if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)))
		error(1, errno, "failed to connect send socket");

	if (send(fd, "a", 1, 0) < 0)
		error(1, errno, "failed to send message");

	return fd;
}
/* Wait for one of the sockets registered with @epfd to become ready and
 * consume the pending data.
 *
 * For SOCK_STREAM the ready socket is treated as a listener: one connection
 * is accepted, read from once and closed. For any other @type the data is
 * read directly from the ready socket.
 *
 * Returns the descriptor stored in the epoll event, i.e. the registered
 * socket that became ready. A timeout or any failure terminates the program
 * through error() with exit status 1.
 */
static int receive_once(int epfd, int type)
{
	struct epoll_event ev;
	ssize_t len;
	char buf[8];
	int ready;

	/* The last argument is the epoll_wait() timeout in milliseconds. */
	ready = epoll_wait(epfd, &ev, 1, 3);
	if (ready < 0)
		error(1, errno, "epoll_wait failed");
	else if (ready == 0)
		/* A timeout does not set errno, so do not print a stale one. */
		error(1, 0, "no socket is ready");

	if (type == SOCK_STREAM) {
		int conn_fd = accept(ev.data.fd, NULL, NULL);

		if (conn_fd < 0)
			error(1, errno, "failed to accept");

		len = recv(conn_fd, buf, sizeof(buf), 0);
		/* Check recv() before close() so errno still belongs to it. */
		if (len < 0)
			error(1, errno, "failed to recv");
		close(conn_fd);
	} else {
		len = recv(ev.data.fd, buf, sizeof(buf), 0);
		if (len < 0)
			error(1, errno, "failed to recv");
	}

	return ev.data.fd;
}
/* Run one delivery round and check which receive socket got the traffic.
 *
 * The @count descriptors in @rcv_fds are registered for EPOLLIN with a
 * fresh epoll instance, a sender is created with connect_and_send(), and
 * receive_once() reports which registered socket became ready. The sender
 * and the epoll instance are closed before returning.
 *
 * Returns non-zero when the ready socket is @fd, zero otherwise.
 */
static int test(int *rcv_fds, int count, int domain, int type, int fd)
{
	struct epoll_event event;
	int epoll_fd, sender, winner;
	int idx = 0;

	epoll_fd = epoll_create(1);
	if (epoll_fd < 0)
		error(1, errno, "failed to create epoll");

	event.events = EPOLLIN;
	while (idx < count) {
		event.data.fd = rcv_fds[idx];
		if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, rcv_fds[idx], &event))
			error(1, errno, "failed to register sock epoll");
		++idx;
	}

	sender = connect_and_send(domain, type);
	winner = receive_once(epoll_fd, type);

	close(sender);
	close(epoll_fd);

	return winner == fd;
}
/* Build one set of receive sockets, run a delivery round and report whether
 * the device-bound socket was the one selected.
 *
 * Exactly one socket (family @domain) has SO_BINDTODEVICE set to "lo".
 * It is created in the middle of the set, with sockets lacking
 * SO_BINDTODEVICE of both families created before and after it. Selection
 * bugs often favour the first or last socket created, so this placement
 * keeps such a bug from passing by accident.
 *
 * Prints "pass" or "fail" to stderr and returns the result of test().
 */
static int run_one_test(int domain, int type, bool do_connect)
{
	enum { TOTAL_FDS = 9 };
	int rcv_fds[TOTAL_FDS];
	int next = 0, bound_idx, passed, k;

	/* Unbound sockets created before the device-bound one. */
	build_rcv_fd(AF_INET, type, &rcv_fds[next], 2, NULL, do_connect);
	next += 2;
	build_rcv_fd(AF_INET6, type, &rcv_fds[next], 2, NULL, do_connect);
	next += 2;

	/* The single socket expected to receive the traffic. */
	bound_idx = next;
	build_rcv_fd(domain, type, &rcv_fds[next], 1, "lo", do_connect);
	next += 1;

	/* Unbound sockets created after the device-bound one. */
	build_rcv_fd(AF_INET, type, &rcv_fds[next], 2, NULL, do_connect);
	next += 2;
	build_rcv_fd(AF_INET6, type, &rcv_fds[next], 2, NULL, do_connect);

	passed = test(rcv_fds, TOTAL_FDS, domain, type, rcv_fds[bound_idx]);

	for (k = 0; k < TOTAL_FDS; ++k)
		close(rcv_fds[k]);

	fputs(passed ? "pass\n" : "fail\n", stderr);

	return passed;
}
/* Run the three scenarios (TCP, unconnected UDP, connected UDP) for one
 * address family, printing a label for each to stderr before it runs.
 *
 * All three scenarios are always executed. Returns the bitwise AND of their
 * results, so the value is non-zero only when every scenario passed.
 */
static int test_family(int domain)
{
	int tcp_ok, udp_ok, udp_conn_ok;

	fprintf(stderr, "%s TCP ... ", get_family_name(domain));
	tcp_ok = run_one_test(domain, SOCK_STREAM, false);

	fprintf(stderr, "%s UDP unconnected ... ", get_family_name(domain));
	udp_ok = run_one_test(domain, SOCK_DGRAM, false);

	fprintf(stderr, "%s UDP connected ... ", get_family_name(domain));
	udp_conn_ok = run_one_test(domain, SOCK_DGRAM, true);

	return 1 & tcp_ok & udp_ok & udp_conn_ok;
}
/* Entry point: run the scenarios for IPv4 and then IPv6.
 *
 * Both families are always tested. Prints "SUCCESS" and exits with 0 when
 * everything passed, otherwise prints "FAIL" and exits with 1.
 */
int main(void)
{
	int v4_ok = test_family(AF_INET);
	int v6_ok = test_family(AF_INET6);

	if (!(1 & v4_ok & v6_ok)) {
		fprintf(stderr, "FAIL\n");
		return 1;
	}

	fprintf(stderr, "SUCCESS\n");
	return 0;
}
Powered by blists - more mailing lists