[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZckR-XOsULLI9EHc@shredder>
Date: Sun, 11 Feb 2024 20:29:13 +0200
From: Ido Schimmel <idosch@...sch.org>
To: Eric Dumazet <edumazet@...gle.com>
Cc: Jakub Kicinski <kuba@...nel.org>,
"David S . Miller" <davem@...emloft.net>,
Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org,
eric.dumazet@...il.com
Subject: Re: [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement
rtnl_dump_ifinfo()
On Sat, Feb 10, 2024 at 12:23:30PM +0100, Eric Dumazet wrote:
> On Fri, Feb 9, 2024 at 11:24 PM Jakub Kicinski <kuba@...nel.org> wrote:
> >
> > On Fri, 9 Feb 2024 14:56:15 +0000 Eric Dumazet wrote:
> > > + unsigned long ifindex = cb->args[0];
> >
> > [snip]
> >
> > > + for_each_netdev_dump(tgt_net, dev, ifindex) {
> > > + if (link_dump_filtered(dev, master_idx, kind_ops))
> > > + continue;
> > > + err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
> > > + NETLINK_CB(cb->skb).portid,
> > > + nlh->nlmsg_seq, 0, flags,
> > > + ext_filter_mask, 0, NULL, 0,
> > > + netnsid, GFP_KERNEL);
> > > +
> > > + if (err < 0)
> > > + break;
> > > + cb->args[0] = ifindex + 1;
> >
> > Perhaps we can cast the context buffer onto something typed and use
> > it directly? I think it's a tiny bit less error prone:
> >
> > struct {
> > unsigned long ifindex;
> > } *ctx = (void *)cb->ctx;
> >
> > Then we can:
> >
> > for_each_netdev_dump(tgt_net, dev, ctx->ifindex)
> > ^^^^^^^^^^^^
> >
> > and not need to worry about saving the ifindex back to cb before
> > exiting.
>
> Hi Jakub
>
> I tried something like that (adding a common structure for future
> conversions), but this was not working properly.
>
> Unfortunately we only can save the ifindex back to cb->XXXX only after
> rtnl_fill_ifinfo() was a success.
>
> For instance, after applying the following diff to my patch, we have a
> bug, because iip link loops on the last device.
>
> We need to set cb->args[0] to last_dev->ifindex + 1 to end the dump.
I was looking into that in the past because of another rtnetlink dump
handler. See merge commit f8d3e0dc4b3a ("Merge branch
'nexthop-nexthop-dump-fixes'") and commit 913f60cacda7 ("nexthop: Fix
infinite nexthop dump when using maximum nexthop ID").
Basically, rtnetlink dump handlers always return a positive value if
some entries were filled in the skb to signal that more information
needs to be dumped, even if the dump is complete. I suspect this was
done to ensure that appending the NLMSG_DONE message will not fail, but
Jason fixed it in 2017 in commit 0642840b8bb0 ("af_netlink: ensure that
NLMSG_DONE never fails in dumps").
You can see that the dump is split across two buffers with only a single
netdev configured:
# strace -e sendto,recvmsg ip l
sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707673609, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 764
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 764
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+++ exited with 0 +++
The fake sentinel ('last_dev->ifindex + 1') is needed so that in the
second invocation of the dump handler it will not fill anything and then
return zero to signal that the dump is complete.
The following diff avoids this inefficiency and returns zero when the
dump is complete:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 31f433950c8d..4efd571a6a3f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2267,17 +2267,15 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0) {
if (likely(skb->len))
- goto out;
-
- goto out_err;
+ err = skb->len;
+ goto out;
}
cont:
idx++;
}
}
+
out:
- err = skb->len;
-out_err:
cb->args[1] = idx;
cb->args[0] = h;
cb->seq = tgt_net->dev_base_seq;
You can see that both messages (RTM_NEWLINK and NLMSG_DONE) are dumped
in a single buffer with this patch:
# strace -e sendto,recvmsg ip l
sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707674313, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+++ exited with 0 +++
And then it's possible to apply your patch with Jakub's suggestion:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4efd571a6a3f..dba13b31c58b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2188,25 +2188,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct rtnl_link_ops *kind_ops = NULL;
struct netlink_ext_ack *extack = cb->extack;
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- struct net *tgt_net = net;
- int h, s_h;
- int idx = 0, s_idx;
- struct net_device *dev;
- struct hlist_head *head;
+ unsigned int flags = NLM_F_MULTI;
struct nlattr *tb[IFLA_MAX+1];
+ struct {
+ unsigned long ifindex;
+ } *ctx = (void *)cb->ctx;
+ struct net *tgt_net = net;
u32 ext_filter_mask = 0;
- const struct rtnl_link_ops *kind_ops = NULL;
- unsigned int flags = NLM_F_MULTI;
+ struct net_device *dev;
int master_idx = 0;
int netnsid = -1;
int err, i;
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
if (err < 0) {
if (cb->strict_check)
@@ -2250,34 +2247,22 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
flags |= NLM_F_DUMP_FILTERED;
walk_entries:
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &tgt_net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
- if (link_dump_filtered(dev, master_idx, kind_ops))
- goto cont;
- if (idx < s_idx)
- goto cont;
- err = rtnl_fill_ifinfo(skb, dev, net,
- RTM_NEWLINK,
- NETLINK_CB(cb->skb).portid,
- nlh->nlmsg_seq, 0, flags,
- ext_filter_mask, 0, NULL, 0,
- netnsid, GFP_KERNEL);
-
- if (err < 0) {
- if (likely(skb->len))
- err = skb->len;
- goto out;
- }
-cont:
- idx++;
+ err = 0;
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ continue;
+ err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq, 0, flags,
+ ext_filter_mask, 0, NULL, 0,
+ netnsid, GFP_KERNEL);
+
+ if (err < 0) {
+ if (likely(skb->len))
+ err = skb->len;
+ break;
}
}
-
-out:
- cb->args[1] = idx;
- cb->args[0] = h;
cb->seq = tgt_net->dev_base_seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
if (netnsid >= 0)
And it will not hang:
# strace -e sendto,recvmsg ip l
sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707675119, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+++ exited with 0 +++
Powered by blists - more mailing lists