[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b0f99499-fb6a-b9ec-7bd3-f535f11a885d@I-love.SAKURA.ne.jp>
Date: Fri, 22 Apr 2022 23:40:59 +0900
From: Tetsuo Handa <penguin-kernel@...ove.SAKURA.ne.jp>
To: Santosh Shilimkar <santosh.shilimkar@...cle.com>,
OFED mailing list <linux-rdma@...r.kernel.org>
Cc: syzbot <syzbot+694120e1002c117747ed@...kaller.appspotmail.com>,
andrii@...nel.org, andriin@...com, ast@...nel.org,
daniel@...earbox.net, davem@...emloft.net, dsahern@...nel.org,
edumazet@...gle.com, john.fastabend@...il.com, kafai@...com,
kpsingh@...nel.org, kuba@...nel.org, kuznet@....inr.ac.ru,
netdev@...r.kernel.org, songliubraving@...com,
syzkaller-bugs@...glegroups.com, tpa@...hospital.com, yhs@...com,
yoshfuji@...ux-ipv6.org, bpf@...r.kernel.org
Subject: Re: [syzbot] KASAN: use-after-free Read in tcp_retransmit_timer (5)
Hello, RDS developers.
I was thinking that the BPF program is relevant to the TCP/IPv6 socket triggering
the use-after-free access. But disassembling the syzkaller-generated BPF program led
me to conclude that what "char program[2053]" is doing is not important
( https://lkml.kernel.org/r/d21e278f-a3ff-8603-f6ba-b51a8cddafa8@I-love.SAKURA.ne.jp ).
Then, I realized that TCP/IPv6 port 16385 (which the reproducer is accessing) is
used by the kernel RDS server, which can explain
"It seems that a socket with sk->sk_net_refcnt=0 is created by unshare(CLONE_NEWNET)"
at https://lkml.kernel.org/r/fa445f0e-32b7-5e0d-9326-94bc5adba4c1@I-love.SAKURA.ne.jp
because the kernel RDS server starts during boot procedure.
------------------------------------------------------------
root@...z:~# unshare -n netstat -tanpe
Active Internet connections (servers and established)
Proto Recv-Q Send-Q Local Address Foreign Address State User Inode PID/Program name
tcp6 0 0 :::16385 :::* LISTEN 0 19627 -
------------------------------------------------------------
With the debug printk() patch shown below,
------------------------------------------------------------
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 0ec2f5906a27..20b3c42b4140 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -429,7 +429,8 @@ static void net_free(struct net *net)
{
if (refcount_dec_and_test(&net->passive)) {
kfree(rcu_access_pointer(net->gen));
- kmem_cache_free(net_cachep, net);
+ memset(net, POISON_FREE, sizeof(struct net));
+ //kmem_cache_free(net_cachep, net);
}
}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 09cadd556d1e..5792fe3df8ac 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -146,10 +146,9 @@ int rds_tcp_accept_one(struct socket *sock)
my_addr = &saddr;
peer_addr = &daddr;
#endif
- rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
- sock->sk->sk_family,
- my_addr, ntohs(inet->inet_sport),
- peer_addr, ntohs(inet->inet_dport));
+ pr_info("accepted family %d tcp %pI6c:%u -> %pI6c:%u refcnt=%d sock_net=%px init_net=%px\n",
+ sock->sk->sk_family, my_addr, ntohs(inet->inet_sport), peer_addr,
+ ntohs(inet->inet_dport), sock->sk->sk_net_refcnt, sock_net(sock->sk), &init_net);
#if IS_ENABLED(CONFIG_IPV6)
/* sk_bound_dev_if is not set if the peer address is not link local
------------------------------------------------------------
I get
accepted family 10 tcp ::ffff:127.0.0.1:16385 -> ::ffff:127.0.0.1:33086 refcnt=0 sock_net=ffffffff860d89c0 init_net=ffffffff860d89c0
if I do
# echo > /dev/tcp/127.0.0.1/16385
from init_net namespace, and I get
accepted family 10 tcp ::ffff:127.0.0.1:16385 -> ::ffff:127.0.0.1:33088 refcnt=0 sock_net=ffff88810a208000 init_net=ffffffff860d89c0
if I do
# echo > /dev/tcp/127.0.0.1/16385
from non-init_net namespace. Note that sock->sk->sk_net_refcnt is 0 in both cases.
Like commit 2303f994b3e18709 ("mptcp: Associate MPTCP context with TCP socket") says
/* kernel sockets do not by default acquire net ref, but TCP timer
* needs it.
*/
, I came to feel that e.g. rds_tcp_accept_one() accessing sock_net(sock->sk) on
accepted sockets with sock->sk->sk_net_refcnt=0 (because the listening socket was
created by the kernel) is what is causing this problem. Why doesn't the RDS kernel server do
sock->sk->sk_net_refcnt = 1;
get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL);
sock_inuse_add(net, 1);
on accepted sockets like mptcp_subflow_create_socket() does?
For your testing, below is the latest reproducer.
You can try this reproducer with the keep-memory-poisoned patch shown above.
------------------------------------------------------------
// https://syzkaller.appspot.com/bug?id=8f0e04b2beffcd42f044d46879cc224f6eb71a99
// autogenerated by syzkaller (https://github.com/google/syzkaller)
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <net/if.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
/* Fallback for libc headers that do not provide MSG_PROBE; the flag is
 * passed to sendmsg() in execute_one(). */
#ifndef MSG_PROBE
#define MSG_PROBE 0x10
#endif
/*
 * Scratch area for building one netlink request and receiving its reply.
 * netlink_init() resets it, netlink_attr() appends to it, and
 * netlink_send_ext() transmits it and reuses buf for the answer.
 */
struct nlmsg {
	/* Write cursor: next free byte inside buf. */
	char *pos;
	/* Must be zero when the message is sent; netlink_send_ext() exits otherwise. */
	int nesting;
	/* Not touched by any helper in this file -- presumably a stack of
	 * open nested attributes; TODO confirm. */
	struct nlattr *nested[8];
	/* Message storage; starts with a struct nlmsghdr. */
	char buf[4096];
};
/*
 * Reset nlmsg and start a new netlink request in it.
 *
 * nlmsg  message builder; zeroed before anything is written
 * typ    value stored in nlmsg_type
 * flags  additional nlmsg_flags; NLM_F_REQUEST | NLM_F_ACK are always set
 * data   family-specific header copied directly behind struct nlmsghdr
 * size   length of data in bytes
 *
 * On return nlmsg->pos points just past the (aligned) family header.
 *
 * Exits with status 1 if size is negative or the header would not fit in
 * nlmsg->buf. The overflow check in netlink_send_ext() runs only after the
 * bytes have been written, so the bound has to be enforced here.
 */
static void netlink_init(struct nlmsg* nlmsg, int typ, int flags,
			 const void* data, int size)
{
	memset(nlmsg, 0, sizeof(*nlmsg));
	struct nlmsghdr* hdr = (struct nlmsghdr*)nlmsg->buf;

	if (size < 0 || (size_t)size > sizeof(nlmsg->buf) - sizeof(*hdr))
		exit(1);

	hdr->nlmsg_type = typ;
	hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
	/* Skip the copy for an empty header so data may be NULL in that case. */
	if (size > 0)
		memcpy(hdr + 1, data, size);
	nlmsg->pos = (char*)(hdr + 1) + NLMSG_ALIGN(size);
}
/*
 * Append one attribute (struct nlattr header + payload) at nlmsg->pos and
 * advance pos past it, rounded up with NLMSG_ALIGN.
 *
 * nlmsg  message being built (see netlink_init)
 * typ    value stored in nla_type
 * data   payload bytes; not read when size is 0
 * size   payload length in bytes
 *
 * Exits with status 1 if size is negative or the attribute would run past
 * the end of nlmsg->buf. The overflow check in netlink_send_ext() runs only
 * after the bytes have been written, so the bound has to be enforced here.
 */
static void netlink_attr(struct nlmsg* nlmsg, int typ, const void* data,
			 int size)
{
	struct nlattr* attr = (struct nlattr*)nlmsg->pos;
	const char* end = nlmsg->buf + sizeof(nlmsg->buf);

	if (size < 0 || nlmsg->pos > end ||
	    (size_t)(end - nlmsg->pos) < sizeof(*attr) ||
	    (size_t)size > (size_t)(end - nlmsg->pos) - sizeof(*attr))
		exit(1);

	attr->nla_len = sizeof(*attr) + size;
	attr->nla_type = typ;
	if (size > 0)
		memcpy(attr + 1, data, size);
	nlmsg->pos += NLMSG_ALIGN(attr->nla_len);
}
/*
 * Send the request built in nlmsg over sock and read a single reply back
 * into the same buffer.
 *
 * nlmsg       finished request; its buffer is overwritten by the reply
 * sock        netlink socket
 * reply_type  message type the caller wants returned as data
 * reply_len   if non-NULL, receives the reply length when the reply is of
 *             reply_type, and 0 otherwise
 * dofail      when true, send/receive failures and malformed replies call
 *             exit(1) instead of returning -1
 *
 * Returns 0 for NLMSG_DONE or a reply of reply_type, -1 for a failed
 * send/receive or a malformed reply (errno is EINVAL for the malformed
 * cases), and otherwise the error field of the NLMSG_ERROR payload, with
 * errno set to its negation.
 *
 * Always exits with status 1 if the message has outgrown its buffer or
 * nlmsg->nesting is non-zero.
 */
static int netlink_send_ext(struct nlmsg* nlmsg, int sock, uint16_t reply_type,
			    int* reply_len, bool dofail)
{
	struct nlmsghdr* const hdr = (struct nlmsghdr*)nlmsg->buf;
	const char* const buf_end = nlmsg->buf + sizeof(nlmsg->buf);
	struct sockaddr_nl kernel;
	ssize_t got;

	if (nlmsg->nesting || nlmsg->pos > buf_end)
		exit(1);

	/* The total length is only known once every attribute has been added. */
	hdr->nlmsg_len = nlmsg->pos - nlmsg->buf;

	memset(&kernel, 0, sizeof(kernel));
	kernel.nl_family = AF_NETLINK;

	got = sendto(sock, nlmsg->buf, hdr->nlmsg_len, 0,
		     (struct sockaddr*)&kernel, sizeof(kernel));
	if (got != (ssize_t)hdr->nlmsg_len)
		goto fail;

	got = recv(sock, nlmsg->buf, sizeof(nlmsg->buf), 0);
	if (reply_len)
		*reply_len = 0;
	if (got < 0)
		goto fail;
	if (got < (ssize_t)sizeof(struct nlmsghdr))
		goto invalid;

	if (hdr->nlmsg_type == NLMSG_DONE)
		return 0;
	if (reply_len && hdr->nlmsg_type == reply_type) {
		*reply_len = got;
		return 0;
	}

	/* Anything else must be a complete NLMSG_ERROR message. */
	if (got < (ssize_t)(sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr)) ||
	    hdr->nlmsg_type != NLMSG_ERROR)
		goto invalid;

	errno = -((struct nlmsgerr*)(hdr + 1))->error;
	return -errno;

invalid:
	errno = EINVAL;
fail:
	if (dofail)
		exit(1);
	return -1;
}
/*
 * Convenience wrapper around netlink_send_ext(): no reply payload is
 * requested and dofail is true, so send/receive failures and malformed
 * replies terminate the process. Returns whatever netlink_send_ext() returns.
 */
static int netlink_send(struct nlmsg* nlmsg, int sock)
{
	const int rc = netlink_send_ext(nlmsg, sock, 0, NULL, true);

	return rc;
}
static void netlink_device_change(int sock, const char* name, const void* mac, int macsize)
{
struct nlmsg nlmsg;
struct ifinfomsg hdr;
memset(&hdr, 0, sizeof(hdr));
hdr.ifi_flags = hdr.ifi_change = IFF_UP;
hdr.ifi_index = if_nametoindex(name);
netlink_init(&nlmsg, RTM_NEWLINK, 0, &hdr, sizeof(hdr));
netlink_attr(&nlmsg, IFLA_ADDRESS, mac, macsize);
netlink_send(&nlmsg, sock);
}
static void netlink_add_addr(int sock, const char* dev, const void* addr, int addrsize)
{
struct nlmsg nlmsg;
struct ifaddrmsg hdr;
memset(&hdr, 0, sizeof(hdr));
hdr.ifa_family = addrsize == 4 ? AF_INET : AF_INET6;
hdr.ifa_prefixlen = addrsize == 4 ? 24 : 120;
hdr.ifa_scope = RT_SCOPE_UNIVERSE;
hdr.ifa_index = if_nametoindex(dev);
netlink_init(&nlmsg, RTM_NEWADDR, NLM_F_CREATE | NLM_F_REPLACE, &hdr,
sizeof(hdr));
netlink_attr(&nlmsg, IFA_LOCAL, addr, addrsize);
netlink_attr(&nlmsg, IFA_ADDRESS, addr, addrsize);
netlink_send(&nlmsg, sock);
}
/*
 * Parse the dotted-quad string addr and assign it to interface dev via
 * netlink_add_addr().
 *
 * Exits with status 1 if addr is not a valid IPv4 address. inet_pton()
 * leaves its output untouched on failure, so continuing would send
 * uninitialised stack bytes to the kernel.
 */
static void netlink_add_addr4(int sock, const char* dev, const char* addr)
{
	struct in_addr in_addr;

	if (inet_pton(AF_INET, addr, &in_addr) != 1)
		exit(1);
	netlink_add_addr(sock, dev, &in_addr, sizeof(in_addr));
}
/*
 * Parse the IPv6 text address addr and assign it to interface dev via
 * netlink_add_addr().
 *
 * Exits with status 1 if addr is not a valid IPv6 address. inet_pton()
 * leaves its output untouched on failure, so continuing would send
 * uninitialised stack bytes to the kernel.
 */
static void netlink_add_addr6(int sock, const char* dev, const char* addr)
{
	struct in6_addr in6_addr;

	if (inet_pton(AF_INET6, addr, &in6_addr) != 1)
		exit(1);
	netlink_add_addr(sock, dev, &in6_addr, sizeof(in6_addr));
}
/*
 * Configure the loopback interface of the current network namespace over
 * rtnetlink: add 127.0.0.1 and ::1 to "lo", then set IFF_UP on it together
 * with a hardware address.
 *
 * Exits with status 1 if the netlink socket cannot be created.
 */
static void initialize_netdevices(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	/* Only the first ETH_ALEN bytes of this object are sent, so the
	 * address that results depends on host byte order. */
	uint64_t macaddr = 0x00aaaaaaaaaa;

	/* socket() reports failure with a negative descriptor; EOF is a stdio
	 * constant and not the right thing to compare against. */
	if (fd < 0)
		exit(1);
	netlink_add_addr4(fd, "lo", "127.0.0.1");
	netlink_add_addr6(fd, "lo", "::1");
	netlink_device_change(fd, "lo", &macaddr, ETH_ALEN);
	close(fd);
}
/* Fallback for headers that do not define the bpf(2) syscall number.
 * NOTE(review): 321 looks like the x86-64 number; other architectures use
 * different values -- confirm before building elsewhere. */
#ifndef __NR_bpf
#define __NR_bpf 321
#endif
/*
 * Core of the reproducer: load a two-instruction socket-filter BPF program,
 * open a TCP socket, then loop forever alternating a sendmsg() towards
 * 127.0.0.1:0x4001 with attaching the program to the socket.
 *
 * No return value is checked. The function never returns on its own;
 * alarm(3) is armed before the loop and no SIGALRM handler is installed
 * here, so presumably the default action ends the process.
 */
static void execute_one(void)
{
	/* NOTE(review): the instruction bytes look like `r0 = 0; exit` --
	 * confirm against the eBPF instruction set. */
	const union bpf_attr prog = {
		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
		.insn_cnt = 2,
		.insns = (unsigned long long) "\xb7\x00\x00\x00\x00\x00\x00\x00\x95\x00\x00\x00\x00\x00\x00\x00",
		.license = (unsigned long long) "GPL",
	};
	struct sockaddr_in rds_peer = {
		.sin_family = AF_INET,
		.sin_port = htons(0x4001), /* where kernel RDS TCPv6 socket is listening */
		.sin_addr.s_addr = inet_addr("127.0.0.1")
	};
	/* Destination only; the message carries no iovec and no control data. */
	const struct msghdr probe = {
		.msg_name = &rds_peer,
		.msg_namelen = sizeof(rds_peer),
	};
	/* 72 is the attr size handed to bpf(2), kept from the original reproducer. */
	const int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &prog, 72);
	const int tcp_fd = socket(PF_INET, SOCK_STREAM, 0);

	alarm(3);
	for (;;) {
		sendmsg(tcp_fd, &probe, MSG_OOB | MSG_PROBE | MSG_CONFIRM | MSG_FASTOPEN);
		setsockopt(tcp_fd, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd));
	}
}
/*
 * Entry point: move into a fresh network namespace, set up its loopback
 * device, then run the reproducer loop. Returns 1 if unshare() fails.
 * Command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
	const int rc = unshare(CLONE_NEWNET);

	if (rc != 0)
		return 1;

	initialize_netdevices();
	execute_one();
	return 0;
}
------------------------------------------------------------
Powered by blists - more mailing lists