[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20201117094023.3685-8-kuniyu@amazon.co.jp>
Date: Tue, 17 Nov 2020 18:40:22 +0900
From: Kuniyuki Iwashima <kuniyu@...zon.co.jp>
To: "David S . Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
Eric Dumazet <edumazet@...gle.com>,
Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>
CC: Benjamin Herrenschmidt <benh@...zon.com>,
Kuniyuki Iwashima <kuniyu@...zon.co.jp>,
Kuniyuki Iwashima <kuni1840@...il.com>, <bpf@...r.kernel.org>,
<netdev@...r.kernel.org>, <linux-kernel@...r.kernel.org>
Subject: [RFC PATCH bpf-next 7/8] bpf: Call bpf_run_sk_reuseport() for socket migration.
This patch makes it possible to select a new listener for socket migration
by eBPF.
The noteworthy point is that we select a listening socket in
reuseport_detach_sock() and reuseport_select_sock(), but we do not have
struct skb in the unhash path.
Since we cannot pass skb to the eBPF program, we run only the
BPF_PROG_TYPE_SK_REUSEPORT program by calling bpf_run_sk_reuseport() with
skb NULL. So, some fields derived from skb are also NULL in the eBPF
program.
Moreover, we can cancel migration by returning SK_DROP. This feature is
useful when listeners have different settings at the socket API level or
when we want to free resources as soon as possible.
Reviewed-by: Benjamin Herrenschmidt <benh@...zon.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@...zon.co.jp>
---
net/core/filter.c | 26 +++++++++++++++++++++-----
net/core/sock_reuseport.c | 23 ++++++++++++++++++++---
net/ipv4/inet_hashtables.c | 2 +-
3 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 01e28f283962..ffc4591878b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8914,6 +8914,22 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
BPF_FIELD_SIZEOF(NS, NF), 0)
+#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF_OR_NULL(S, NS, F, NF, SIZE, OFF) \
+ do { \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \
+ si->src_reg, offsetof(S, F)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); \
+ *insn++ = BPF_LDX_MEM( \
+ SIZE, si->dst_reg, si->dst_reg, \
+ bpf_target_off(NS, NF, sizeof_field(NS, NF), \
+ target_size) \
+ + OFF); \
+ } while (0)
+
+#define SOCK_ADDR_LOAD_NESTED_FIELD_OR_NULL(S, NS, F, NF) \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF_OR_NULL(S, NS, F, NF, \
+ BPF_FIELD_SIZEOF(NS, NF), 0)
+
/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
*
@@ -9858,7 +9874,7 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
reuse_kern->skb = skb;
reuse_kern->sk = sk;
reuse_kern->selected_sk = NULL;
- reuse_kern->data_end = skb->data + skb_headlen(skb);
+ reuse_kern->data_end = skb ? skb->data + skb_headlen(skb) : NULL;
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
reuse_kern->bind_inany = reuse->bind_inany;
@@ -10039,10 +10055,10 @@ sk_reuseport_is_valid_access(int off, int size,
})
#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
- SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
- struct sk_buff, \
- skb, \
- SKB_FIELD)
+ SOCK_ADDR_LOAD_NESTED_FIELD_OR_NULL(struct sk_reuseport_kern, \
+ struct sk_buff, \
+ skb, \
+ SKB_FIELD)
#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \
SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 74a46197854b..903f78ab35c3 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -224,6 +224,7 @@ struct sock *reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
struct sock *nsk = NULL;
+ struct bpf_prog *prog;
int i;
spin_lock_bh(&reuseport_lock);
@@ -249,8 +250,16 @@ struct sock *reuseport_detach_sock(struct sock *sk)
reuse->socks[i] = reuse->socks[reuse->num_socks];
if (reuse->migrate_req) {
- if (reuse->num_socks)
- nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
+ if (reuse->num_socks) {
+ prog = rcu_dereference(reuse->prog);
+ if (prog && prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ nsk = bpf_run_sk_reuseport(reuse, sk, prog,
+ NULL, sk->sk_hash);
+
+ if (!nsk)
+ nsk = i == reuse->num_socks ?
+ reuse->socks[i - 1] : reuse->socks[i];
+ }
reuse->num_closed_socks++;
reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
@@ -340,8 +349,16 @@ struct sock *reuseport_select_sock(struct sock *sk,
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
- if (!prog || !skb)
+ if (!prog)
+ goto select_by_hash;
+
+ if (!skb) {
+ if (reuse->migrate_req &&
+ prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+
goto select_by_hash;
+ }
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index f35c76cf3365..d981e4876679 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -647,7 +647,7 @@ void inet_unhash(struct sock *sk)
if (rcu_access_pointer(sk->sk_reuseport_cb)) {
nsk = reuseport_detach_sock(sk);
- if (nsk)
+ if (!IS_ERR_OR_NULL(nsk))
inet_csk_reqsk_queue_migrate(sk, nsk);
}
--
2.17.2 (Apple Git-113)
Powered by blists - more mailing lists