[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CANn89iLtS3hG5BBHSi0yR_VPuG0p-Sdq+DigXah6MB54zxdw1g@mail.gmail.com>
Date: Sun, 11 Feb 2024 22:35:40 +0100
From: Eric Dumazet <edumazet@...gle.com>
To: Ido Schimmel <idosch@...sch.org>
Cc: Jakub Kicinski <kuba@...nel.org>, "David S . Miller" <davem@...emloft.net>,
Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org, eric.dumazet@...il.com
Subject: Re: [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()
On Sun, Feb 11, 2024 at 7:29 PM Ido Schimmel <idosch@...sch.org> wrote:
>
>
> I was looking into that in the past because of another rtnetlink dump
> handler. See merge commit f8d3e0dc4b3a ("Merge branch
> 'nexthop-nexthop-dump-fixes'") and commit 913f60cacda7 ("nexthop: Fix
> infinite nexthop dump when using maximum nexthop ID").
>
> Basically, rtnetlink dump handlers always return a positive value if
> some entries were filled in the skb to signal that more information
> needs to be dumped, even if the dump is complete. I suspect this was
> done to ensure that appending the NLMSG_DONE message will not fail, but
> Jason fixed it in 2017 in commit 0642840b8bb0 ("af_netlink: ensure that
> NLMSG_DONE never fails in dumps").
>
> You can see that the dump is split across two buffers with only a single
> netdev configured:
>
> # strace -e sendto,recvmsg ip l
> sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707673609, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 764
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 764
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> +++ exited with 0 +++
>
> The fake sentinel ('last_dev->ifindex + 1') is needed so that in the
> second invocation of the dump handler it will not fill anything and then
> return zero to signal that the dump is complete.
>
> The following diff avoids this inefficiency and returns zero when the
> dump is complete:
>
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index 31f433950c8d..4efd571a6a3f 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -2267,17 +2267,15 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>
> if (err < 0) {
> if (likely(skb->len))
> - goto out;
> -
> - goto out_err;
> + err = skb->len;
> + goto out;
> }
> cont:
> idx++;
> }
> }
> +
> out:
> - err = skb->len;
> -out_err:
> cb->args[1] = idx;
> cb->args[0] = h;
> cb->seq = tgt_net->dev_base_seq;
>
> You can see that both messages (RTM_NEWLINK and NLMSG_DONE) are dumped
> in a single buffer with this patch:
>
> # strace -e sendto,recvmsg ip l
> sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707674313, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> +++ exited with 0 +++
>
> And then it's possible to apply your patch with Jakub's suggestion:
>
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index 4efd571a6a3f..dba13b31c58b 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -2188,25 +2188,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
>
> static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
> {
> + const struct rtnl_link_ops *kind_ops = NULL;
> struct netlink_ext_ack *extack = cb->extack;
> const struct nlmsghdr *nlh = cb->nlh;
> struct net *net = sock_net(skb->sk);
> - struct net *tgt_net = net;
> - int h, s_h;
> - int idx = 0, s_idx;
> - struct net_device *dev;
> - struct hlist_head *head;
> + unsigned int flags = NLM_F_MULTI;
> struct nlattr *tb[IFLA_MAX+1];
> + struct {
> + unsigned long ifindex;
> + } *ctx = (void *)cb->ctx;
> + struct net *tgt_net = net;
> u32 ext_filter_mask = 0;
> - const struct rtnl_link_ops *kind_ops = NULL;
> - unsigned int flags = NLM_F_MULTI;
> + struct net_device *dev;
> int master_idx = 0;
> int netnsid = -1;
> int err, i;
>
> - s_h = cb->args[0];
> - s_idx = cb->args[1];
> -
> err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
> if (err < 0) {
> if (cb->strict_check)
> @@ -2250,34 +2247,22 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
> flags |= NLM_F_DUMP_FILTERED;
>
> walk_entries:
> - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
> - idx = 0;
> - head = &tgt_net->dev_index_head[h];
> - hlist_for_each_entry(dev, head, index_hlist) {
> - if (link_dump_filtered(dev, master_idx, kind_ops))
> - goto cont;
> - if (idx < s_idx)
> - goto cont;
> - err = rtnl_fill_ifinfo(skb, dev, net,
> - RTM_NEWLINK,
> - NETLINK_CB(cb->skb).portid,
> - nlh->nlmsg_seq, 0, flags,
> - ext_filter_mask, 0, NULL, 0,
> - netnsid, GFP_KERNEL);
> -
> - if (err < 0) {
> - if (likely(skb->len))
> - err = skb->len;
> - goto out;
> - }
> -cont:
> - idx++;
> + err = 0;
> + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
> + if (link_dump_filtered(dev, master_idx, kind_ops))
> + continue;
> + err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
> + NETLINK_CB(cb->skb).portid,
> + nlh->nlmsg_seq, 0, flags,
> + ext_filter_mask, 0, NULL, 0,
> + netnsid, GFP_KERNEL);
> +
> + if (err < 0) {
> + if (likely(skb->len))
> + err = skb->len;
> + break;
> }
> }
> -
> -out:
> - cb->args[1] = idx;
> - cb->args[0] = h;
> cb->seq = tgt_net->dev_base_seq;
> nl_dump_check_consistent(cb, nlmsg_hdr(skb));
> if (netnsid >= 0)
>
> And it will not hang:
>
> # strace -e sendto,recvmsg ip l
> sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707675119, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> +++ exited with 0 +++
Excellent, thanks for helping me on this ;)
I am sending a V2 right away.
Powered by blists - more mailing lists