lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANn89iK+yWD8jKwvRO_4-Kz7Qumk5WwmtwJDWMKDU2oLyDdGxA@mail.gmail.com>
Date: Fri, 23 Jun 2023 16:25:24 +0200
From: Eric Dumazet <edumazet@...gle.com>
To: Matthieu Baerts <matthieu.baerts@...sares.net>
Cc: mptcp@...ts.linux.dev, Mat Martineau <martineau@...nel.org>, 
	"David S. Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, 
	Shuah Khan <shuah@...nel.org>, netdev@...r.kernel.org, linux-kernel@...r.kernel.org, 
	linux-kselftest@...r.kernel.org
Subject: Re: [PATCH net-next 2/9] mptcp: track some aggregate data counters

On Tue, Jun 20, 2023 at 6:30 PM Matthieu Baerts
<matthieu.baerts@...sares.net> wrote:
>
> From: Paolo Abeni <pabeni@...hat.com>
>
> Currently there are no data transfer counters accounting for all
> the subflows used by a given MPTCP socket. User space can compute
> such figures by aggregating the subflow info, but the result is
> inaccurate if any subflow is closed before the MPTCP socket itself.
>
> Add the new counters in the MPTCP socket itself and expose them
> via the existing diag and sockopt. While touching mptcp_diag_fill_info(),
> acquire the relevant locks before fetching the msk data, to ensure
> better data consistency.
>
> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/385
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>
> Reviewed-by: Matthieu Baerts <matthieu.baerts@...sares.net>
> Signed-off-by: Matthieu Baerts <matthieu.baerts@...sares.net>
> ---
>  include/uapi/linux/mptcp.h |  5 +++++
>  net/mptcp/options.c        | 10 ++++++++--
>  net/mptcp/protocol.c       | 11 ++++++++++-
>  net/mptcp/protocol.h       |  4 ++++
>  net/mptcp/sockopt.c        | 25 ++++++++++++++++++++-----
>  5 files changed, 47 insertions(+), 8 deletions(-)
>
> diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> index 32af2d278cb4..a124be6ebbba 100644
> --- a/include/uapi/linux/mptcp.h
> +++ b/include/uapi/linux/mptcp.h
> @@ -123,6 +123,11 @@ struct mptcp_info {
>         __u8    mptcpi_local_addr_used;
>         __u8    mptcpi_local_addr_max;
>         __u8    mptcpi_csum_enabled;
> +       __u32   mptcpi_retransmits;
> +       __u64   mptcpi_bytes_retrans;
> +       __u64   mptcpi_bytes_sent;
> +       __u64   mptcpi_bytes_received;
> +       __u64   mptcpi_bytes_acked;
>  };
>
>  /*
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 4bdcd2b326bd..c254accb14de 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -1026,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
>         return cur_seq;
>  }
>
> +static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
> +{
> +       msk->bytes_acked += new_snd_una - msk->snd_una;
> +       msk->snd_una = new_snd_una;
> +}
> +
>  static void ack_update_msk(struct mptcp_sock *msk,
>                            struct sock *ssk,
>                            struct mptcp_options_received *mp_opt)
> @@ -1057,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk,
>                 __mptcp_check_push(sk, ssk);
>
>         if (after64(new_snd_una, old_snd_una)) {
> -               msk->snd_una = new_snd_una;
> +               __mptcp_snd_una_update(msk, new_snd_una);
>                 __mptcp_data_acked(sk);
>         }
>         mptcp_data_unlock(sk);
> @@ -1123,7 +1129,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
>                 /* on fallback we just need to ignore the msk-level snd_una, as
>                  * this is really plain TCP
>                  */
> -               msk->snd_una = READ_ONCE(msk->snd_nxt);
> +               __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));
>
>                 __mptcp_data_acked(subflow->conn);
>                 mptcp_data_unlock(subflow->conn);
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 9c756d675d4d..d5b8e488bce1 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -377,6 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
>
>         if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
>                 /* in sequence */
> +               msk->bytes_received += copy_len;
>                 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
>                 tail = skb_peek_tail(&sk->sk_receive_queue);
>                 if (tail && mptcp_try_coalesce(sk, tail, skb))
> @@ -760,6 +761,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
>                         MPTCP_SKB_CB(skb)->map_seq += delta;
>                         __skb_queue_tail(&sk->sk_receive_queue, skb);
>                 }
> +               msk->bytes_received += end_seq - msk->ack_seq;
>                 msk->ack_seq = end_seq;
>                 moved = true;
>         }
> @@ -1531,8 +1533,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
>          * that has been handed to the subflow for transmission
>          * and skip update in case it was old dfrag.
>          */
> -       if (likely(after64(snd_nxt_new, msk->snd_nxt)))
> +       if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
> +               msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
>                 msk->snd_nxt = snd_nxt_new;
> +       }
>  }
>
>  void mptcp_check_and_set_pending(struct sock *sk)
> @@ -2590,6 +2594,7 @@ static void __mptcp_retrans(struct sock *sk)
>         }
>         if (copied) {
>                 dfrag->already_sent = max(dfrag->already_sent, info.sent);
> +               msk->bytes_retrans += copied;
>                 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
>                          info.size_goal);
>                 WRITE_ONCE(msk->allow_infinite_fallback, false);
> @@ -3102,6 +3107,10 @@ static int mptcp_disconnect(struct sock *sk, int flags)
>         WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
>         mptcp_pm_data_reset(msk);
>         mptcp_ca_reset(sk);
> +       msk->bytes_acked = 0;
> +       msk->bytes_received = 0;
> +       msk->bytes_sent = 0;
> +       msk->bytes_retrans = 0;
>
>         WRITE_ONCE(sk->sk_shutdown, 0);
>         sk_error_report(sk);
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 47b46602870e..27adfcc5aaa2 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -262,10 +262,13 @@ struct mptcp_sock {
>         u64             local_key;
>         u64             remote_key;
>         u64             write_seq;
> +       u64             bytes_sent;
>         u64             snd_nxt;
> +       u64             bytes_received;
>         u64             ack_seq;
>         atomic64_t      rcv_wnd_sent;
>         u64             rcv_data_fin_seq;
> +       u64             bytes_retrans;
>         int             rmem_fwd_alloc;
>         struct sock     *last_snd;
>         int             snd_burst;
> @@ -274,6 +277,7 @@ struct mptcp_sock {
>                                                  * recovery related fields are under data_lock
>                                                  * protection
>                                                  */
> +       u64             bytes_acked;
>         u64             snd_una;
>         u64             wnd_end;
>         unsigned long   timer_ival;
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index e172a5848b0d..fa5055d5b029 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -889,7 +889,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
>
>  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>  {
> +       struct sock *sk = (struct sock *)msk;
>         u32 flags = 0;
> +       bool slow;
>
>         memset(info, 0, sizeof(*info));
>
> @@ -898,6 +900,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>         info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
>         info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
>
> +       if (inet_sk_state_load(sk) == TCP_LISTEN)
> +               return;
> +
>         /* The following limits only make sense for the in-kernel PM */
>         if (mptcp_pm_is_kernel(msk)) {
>                 info->mptcpi_subflows_max =
> @@ -915,11 +920,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>         if (READ_ONCE(msk->can_ack))
>                 flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
>         info->mptcpi_flags = flags;
> -       info->mptcpi_token = READ_ONCE(msk->token);
> -       info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> -       info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
> -       info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
> -       info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
> +       mptcp_data_lock(sk);
> +       info->mptcpi_snd_una = msk->snd_una;
> +       info->mptcpi_rcv_nxt = msk->ack_seq;
> +       info->mptcpi_bytes_acked = msk->bytes_acked;
> +       mptcp_data_unlock(sk);
> +
> +       slow = lock_sock_fast(sk);

This causes a lockdep issue.

lock_sock_fast(sk) can sleep if the socket lock is owned by another process.

But we are called from a context where both a spin lock and
rcu_read_lock() are held, so sleeping is not allowed here.

BUG: sleeping function called from invalid context at net/core/sock.c:3549
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 316, name: syz-executor.4
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 0
7 locks held by syz-executor.4/316:
#0: ffffffff8e125408 (sock_diag_mutex){+.+.}-{3:3}, at:
sock_diag_rcv+0x1b/0x40 net/core/sock_diag.c:279
#1: ffffffff8e125588 (sock_diag_table_mutex){+.+.}-{3:3}, at:
sock_diag_rcv_msg net/core/sock_diag.c:259 [inline]
#1: ffffffff8e125588 (sock_diag_table_mutex){+.+.}-{3:3}, at:
sock_diag_rcv_msg+0x2d2/0x440 net/core/sock_diag.c:248
#2: ffff8880232bb688 (nlk_cb_mutex-SOCK_DIAG){+.+.}-{3:3}, at:
netlink_dump+0xbe/0xc50 net/netlink/af_netlink.c:2215
#3: ffffffff8e29a628 (inet_diag_table_mutex){+.+.}-{3:3}, at:
inet_diag_lock_handler+0x6e/0x100 net/ipv4/inet_diag.c:63
#4: ffffffff8c7990c0 (rcu_read_lock){....}-{1:2}, at:
mptcp_diag_dump_listeners net/mptcp/mptcp_diag.c:95 [inline]
#4: ffffffff8c7990c0 (rcu_read_lock){....}-{1:2}, at:
mptcp_diag_dump+0x7c8/0x1330 net/mptcp/mptcp_diag.c:197
#5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock
include/linux/spinlock.h:350 [inline]
#5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
mptcp_diag_dump_listeners net/mptcp/mptcp_diag.c:98 [inline]
#5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
mptcp_diag_dump+0x838/0x1330 net/mptcp/mptcp_diag.c:197
#6: ffff88802c42a5f0 (msk_lock-AF_INET){+.+.}-{0:0}, at:
mptcp_diag_get_info+0x1ae/0x380 net/mptcp/mptcp_diag.c:224
Preemption disabled at:
[<0000000000000000>] 0x0

> +       info->mptcpi_csum_enabled = msk->csum_enabled;
> +       info->mptcpi_token = msk->token;
> +       info->mptcpi_write_seq = msk->write_seq;
> +       info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
> +       info->mptcpi_bytes_sent = msk->bytes_sent;
> +       info->mptcpi_bytes_received = msk->bytes_received;
> +       info->mptcpi_bytes_retrans = msk->bytes_retrans;
> +       unlock_sock_fast(sk, slow);
>  }
>  EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
>
>
> --
> 2.40.1
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ