[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f2bbefed-f17c-4127-a87b-13a5933e98cd@ovn.org>
Date: Tue, 9 Apr 2024 11:35:04 +0200
From: Ilya Maximets <i.maximets@....org>
To: Adrian Moreno <amorenoz@...hat.com>, netdev@...r.kernel.org
Cc: i.maximets@....org, jiri@...nulli.us, xiyou.wangcong@...il.com,
cmi@...dia.com, yotam.gi@...il.com, aconole@...hat.com, echaudro@...hat.com,
horms@...nel.org
Subject: Re: [RFC net-next v2 5/5] net:openvswitch: add psample support
On 4/9/24 10:16, Adrian Moreno wrote:
>
>
> On 4/8/24 22:40, Ilya Maximets wrote:
>> On 4/8/24 21:48, Adrian Moreno wrote:
>>>
>>>
>>> On 4/8/24 15:37, Ilya Maximets wrote:
>>>> On 4/8/24 14:57, Adrian Moreno wrote:
>>>>> Add a new attribute to the sample action, called
>>>>> OVS_SAMPLE_ATTR_PSAMPLE to allow userspace to pass a group_id and a
>>>>> user-defined cookie.
>>>>>
>>>>> The maximum length of the user-defined cookie is set to 16, same as
>>>>> tc_cookie to discourage using cookies that will not be offloadable.
>>>>>
>>>>> When set, the sample action will use psample to multicast the sample.
>>>>>
>>>>> Signed-off-by: Adrian Moreno <amorenoz@...hat.com>
>>>>> ---
>>>>> include/uapi/linux/openvswitch.h | 22 +++++++--
>>>>> net/openvswitch/actions.c | 52 ++++++++++++++++++---
>>>>> net/openvswitch/datapath.c | 2 +-
>>>>> net/openvswitch/flow_netlink.c | 78 +++++++++++++++++++++++++-------
>>>>> 4 files changed, 127 insertions(+), 27 deletions(-)
>>>>
>>>> This cpatch is missing a few bits:
>>>> - Update for Documentation/netlink/specs/ovs_flow.yaml
>>>> - Maybe update for tools/testing/selftests/net/openvswitch/ovs-dpctl.py
>>>> - Maybe some basic selftests.
>>>>
>>>
>>> Absolutely. I surely plan to add it on the first non-rfc version.
>>>
>>>>>
>>>>> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
>>>>> index efc82c318fa2..a5a32588f582 100644
>>>>> --- a/include/uapi/linux/openvswitch.h
>>>>> +++ b/include/uapi/linux/openvswitch.h
>>>>> @@ -646,15 +646,24 @@ enum ovs_flow_attr {
>>>>> * %UINT32_MAX samples all packets and intermediate values sample intermediate
>>>>> * fractions of packets.
>>>>> * @OVS_SAMPLE_ATTR_ACTIONS: Set of actions to execute in sampling event.
>>>>> - * Actions are passed as nested attributes.
>>>>> + * Actions are passed as nested attributes. Optional if OVS_SAMPLE_ATTR_PSAMPLE
>>>>> + * is not set.
>>>>
>>>> 'is set' probably. >
>>>>> + * @OVS_SAMPLE_ATTR_PSAMPLE: Arguments to be passed to psample. Optional if
>>>>> + * OVS_SAMPLE_ATTR_ACTIONS is not set.
>>>>
>>>> Same here.
>>>>
>>>
>>> Good catch, I rewrote those comments a bunch of times and the were left
>>> expressing the exact opposite!
>>>
>>>
>>>>> *
>>>>> - * Executes the specified actions with the given probability on a per-packet
>>>>> - * basis.
>>>>> + * Either OVS_SAMPLE_ATTR_USER_COOKIE or OVS_SAMPLE_ATTR_USER_COOKIE must be
>>>>> + * specified.
>>>>> + *
>>>>> + * Executes the specified actions and/or sends the packet to psample
>>>>> + * with the given probability on a per-packet basis.
>>>>> */
>>>>> enum ovs_sample_attr {
>>>>> OVS_SAMPLE_ATTR_UNSPEC,
>>>>> OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */
>>>>> OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
>>>>> + OVS_SAMPLE_ATTR_PSAMPLE, /* struct ovs_psample followed
>>>>> + * by the user-provided cookie.
>>>>> + */
>>>>> __OVS_SAMPLE_ATTR_MAX,
>>>>>
>>>>> #ifdef __KERNEL__
>>>>> @@ -675,6 +684,13 @@ struct sample_arg {
>>>>> };
>>>>> #endif
>>>>>
>>>>> +#define OVS_PSAMPLE_COOKIE_MAX_SIZE 16
>>>>> +struct ovs_psample {
>>>>> + __u32 group_id; /* The group used for packet sampling. */
>>>>> + __u32 user_cookie_len; /* The length of the user-provided cookie. */
>>
>> Here as well, not sure if u32 makes sense as userspace can't
>> actually supply a cookie this large via netlink.
>>
>>>>> + __u8 user_cookie[]; /* The user-provided cookie. */
>>>>> +};
>>>>
>>>> Structures are not a good approach for modern netlink.
>>>> use nested attributes instead. This way we can also
>>>> eliminate the need for variable-length array and the
>>>> length field, if the length can be taken from a netlink
>>>> attribute directly, e.g. similar to NLA_BINARY in tc.
>>>>
>>>> If necessary, there could be a structure in the private
>>>> header to store the data for internal use.
>>>>
>>>
>>> Interesting. Thanks for the suggestion. I think it will also help action validation.
>>>
>>>>> +
>>>>> /**
>>>>> * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
>>>>> * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION
>>>>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>>>>> index 6fcd7e2ca81f..45d2b325b76a 100644
>>>>> --- a/net/openvswitch/actions.c
>>>>> +++ b/net/openvswitch/actions.c
>>>>> @@ -24,6 +24,7 @@
>>>>> #include <net/checksum.h>
>>>>> #include <net/dsfield.h>
>>>>> #include <net/mpls.h>
>>>>> +#include <net/psample.h>
>>>>> #include <net/sctp/checksum.h>
>>>>>
>>>>> #include "datapath.h"
>>>>> @@ -1025,6 +1026,31 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb,
>>>>> return 0;
>>>>> }
>>>>>
>>>>> +static int ovs_psample_packet(struct datapath *dp, struct sw_flow_key *key,
>>>>> + struct ovs_psample *psample, struct sk_buff *skb,
>>>>> + u32 rate)
>>>>> +{
>>>>> + struct psample_group psample_group = {};
>>>>> + struct psample_metadata md = {};
>>>>> + struct vport *input_vport;
>>>>> +
>>>>> + psample_group.group_num = psample->group_id;
>>>>> + psample_group.net = ovs_dp_get_net(dp);
>>>>> +
>>>>> + input_vport = ovs_vport_rcu(dp, key->phy.in_port);
>>>>> + if (!input_vport)
>>>>> + input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);
>>>>> +
>>>>> + md.in_ifindex = input_vport->dev->ifindex;
>>>>> + md.user_cookie = psample->user_cookie;
>>>>> + md.user_cookie_len = psample->user_cookie_len;
>>>>> + md.trunc_size = skb->len;
>>>>> +
>>>>> + psample_sample_packet(&psample_group, skb, rate, &md);
>>>>> +
>>>>> + return 0;
>>>>> +}
>>>>> +
>>>>> /* When 'last' is true, sample() should always consume the 'skb'.
>>>>> * Otherwise, sample() should keep 'skb' intact regardless what
>>>>> * actions are executed within sample().
>>>>> @@ -1033,16 +1059,17 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
>>>>> struct sw_flow_key *key, const struct nlattr *attr,
>>>>> bool last)
>>>>> {
>>>>> - struct nlattr *actions;
>>>>> + const struct sample_arg *arg;
>>>>> struct nlattr *sample_arg;
>>>>> int rem = nla_len(attr);
>>>>> - const struct sample_arg *arg;
>>>>> + struct nlattr *next;
>>>>> bool clone_flow_key;
>>>>> + int ret;
>>>>>
>>>>> /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
>>>>> sample_arg = nla_data(attr);
>>>>> arg = nla_data(sample_arg);
>>>>> - actions = nla_next(sample_arg, &rem);
>>>>> + next = nla_next(sample_arg, &rem);
>>>>>
>>>>> if ((arg->probability != U32_MAX) &&
>>>>> (!arg->probability || get_random_u32() > arg->probability)) {
>>>>> @@ -1051,9 +1078,22 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
>>>>> return 0;
>>>>> }
>>>>>
>>>>> - clone_flow_key = !arg->exec;
>>>>> - return clone_execute(dp, skb, key, 0, actions, rem, last,
>>>>> - clone_flow_key);
>>>>> + if (next->nla_type == OVS_SAMPLE_ATTR_PSAMPLE) {
>>>>
>>>> Maybe add a commnet that OVS_SAMPLE_ATTR_PSAMPLE is always a sencond
>>>> argument when present.
>>>>
>>>> Is there a better way to handle this?
>>>>
>>>
>>> I also dislike it. The fact that actions are not nested but concatenated to
>>> makes the internal representation a bit flaky.
>>>
>>> The alternative I considered was adding the group_id and cookie to the
>>> internal-only OVS_SAMPLE_ATTR_ARG. However, now I wonder:
>>> - Should we also use nested attributes instead of structs for this internal one?
>>>
>>> And, probably off-topic: what's the story behind using netlink attributes to
>>> store action arguments internally? Has it ever been discussed using a union for
>>> instance?
>>
>> I'm not sure why it was originally done this way, but it is probably
>> the most efficient of convenient ways to pack a tree-like structure.
>> If we had a union, we would likely need a tree of actions with each
>> action bing a separately allocated node, since we have clone() and
>> even check_pkt_len() or other actions that can fork the pipeline.
>> Or we'll need to use some dummy actions as parethesis, prectically
>> emulating what we already have with netlink, but with structures.
>>
>> Netlink format is fast to scan, since it's in a single liner chunk
>> of memory. Tree or list structure may have higher memory footprint
>> and be slower to iterate due to cache misses.
>>
>
> That's true. Iteration over a number of actions is easy in netlink. And nesting
> easily enables tree-like actions. But in the context of:
>
> > Structures are not a good approach for modern netlink. use nested attributes
> instead.
>
> If we try to implement all our actions following this way, and we keep just
> copying the incoming actions into the internal representation, we incur in
> unnecessary memory overhead (e.g: storing 2x struct nlaattr + padding of extra
> memory to store 2 integers).
>
> I don't want to derail the discussion into historical or futuristic changes,
> just saying that the approach taken in the SAMPLE action (not including this
> patch) of exposing arguments as attributes but having a kernel-only struct to
> store them seems to me a good compromise.
Sure. As I said, it's fine to have internal structures. My comment
was mainly about uAPI part. We should avoid structures in uAPI if
possible, as they are very hard to maintain and keep compatible with
older userspace in case some changes will be needed in the future.
>
>
>> There might be a better way to store all this for sure, but will
>> require some careful performance and memory consumption testing.
>>
>>>
>>>>> + ret = ovs_psample_packet(dp, key, nla_data(next), skb,
>>>>> + arg->probability);
>>>>> + if (last)
>>>>> + ovs_kfree_skb_reason(skb, OVS_DROP_LAST_ACTION);
>>>>> + if (ret)
>>>>> + return ret;
>>>>> + next = nla_next(next, &rem);
>>>>> + }
>>>>> +
>>>>> + if (nla_ok(next, rem)) {
>>>>> + clone_flow_key = !arg->exec;
>>>>> + ret = clone_execute(dp, skb, key, 0, next, rem, last,
>>>>> + clone_flow_key);
>>>>> + }
>>>>> + return ret;
>>>>> }
>>>>>
>>>>> /* When 'last' is true, clone() should always consume the 'skb'.
>>>>> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
>>>>> index 99d72543abd3..b5b560c2e74b 100644
>>>>> --- a/net/openvswitch/datapath.c
>>>>> +++ b/net/openvswitch/datapath.c
>>>>> @@ -976,7 +976,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
>>>>> struct sw_flow_match match;
>>>>> u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
>>>>> int error;
>>>>> - bool log = !a[OVS_FLOW_ATTR_PROBE];
>>>>> + bool log = true;
>>>>
>>>> Debugging artifact?
>>>>
>>>
>>> Yep, sorry.
>>>
>>>>>
>>>>> /* Must have key and actions. */
>>>>> error = -EINVAL;
>>>>> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
>>>>> index f224d9bcea5e..f540686271b7 100644
>>>>> --- a/net/openvswitch/flow_netlink.c
>>>>> +++ b/net/openvswitch/flow_netlink.c
>>>>> @@ -2381,8 +2381,12 @@ static void ovs_nla_free_sample_action(const struct nlattr *action)
>>>>>
>>>>> switch (nla_type(a)) {
>>>>> case OVS_SAMPLE_ATTR_ARG:
>>>>> - /* The real list of actions follows this attribute. */
>>>>
>>>> Please, don't remove this comment. Maybe extend it instead.
>>>>
>>>>> a = nla_next(a, &rem);
>>>>> +
>>>>> + /* OVS_SAMPLE_ATTR_PSAMPLE may be present. */
>>>>> + if (nla_type(a) == OVS_SAMPLE_ATTR_PSAMPLE)
>>>>> + a = nla_next(a, &rem);
>>>>> +
>>>>> ovs_nla_free_nested_actions(a, rem);
>>>>> break;
>>>>> }
>>>>> @@ -2561,6 +2565,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
>>>>> u32 mpls_label_count, bool log,
>>>>> u32 depth);
>>>>>
>>>>> +static int copy_action(const struct nlattr *from,
>>>>> + struct sw_flow_actions **sfa, bool log);
>>>>> +
>>>>> static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
>>>>> const struct sw_flow_key *key,
>>>>> struct sw_flow_actions **sfa,
>>>>> @@ -2569,10 +2576,10 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
>>>>> u32 depth)
>>>>> {
>>>>> const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
>>>>> - const struct nlattr *probability, *actions;
>>>>> + const struct nlattr *probability, *actions, *psample;
>>>>> const struct nlattr *a;
>>>>> - int rem, start, err;
>>>>> struct sample_arg arg;
>>>>> + int rem, start, err;
>>>>>
>>>>> memset(attrs, 0, sizeof(attrs));
>>>>> nla_for_each_nested(a, attr, rem) {
>>>>> @@ -2589,7 +2596,23 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
>>>>> return -EINVAL;
>>>>>
>>>>> actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
>>>>> - if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
>>>>> + if (actions && (!nla_len(actions) || nla_len(actions) < NLA_HDRLEN))
>>>>> + return -EINVAL;
>>>>> +
>>>>> + psample = attrs[OVS_SAMPLE_ATTR_PSAMPLE];
>>>>> + if (psample) {
>>>>> + struct ovs_psample *ovs_ps;
>>>>> +
>>>>> + if (!nla_len(psample) || nla_len(psample) < sizeof(*ovs_ps))
>>>>> + return -EINVAL;
>>>>> +
>>>>> + ovs_ps = nla_data(psample);
>>>>> + if (ovs_ps->user_cookie_len > OVS_PSAMPLE_COOKIE_MAX_SIZE ||
>>>>> + nla_len(psample) != sizeof(*ovs_ps) + ovs_ps->user_cookie_len)
>>>>> + return -EINVAL;
>>>>> + }
>>>>> +
>>>>> + if (!psample && !actions)
>>>>> return -EINVAL;
>>>>>
>>>>> /* validation done, copy sample action. */
>>>>> @@ -2608,7 +2631,9 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
>>>>> * If the sample is the last action, it can always be excuted
>>>>> * rather than deferred.
>>>>> */
>>>>> - arg.exec = last || !actions_may_change_flow(actions);
>>>>> + if (actions)
>>>>> + arg.exec = last || !actions_may_change_flow(actions);
>>>>
>>>> 'arg.exec' will remain uninitialized.
>>>>
>>>
>>> Yes. I just wanted to avoid the call to actions_may_change_flow(NULL) in which
>>> case arg.exec is not used. I'll make sure to zero-initialize it nevertheless.
>>>
>>>>> +
>>>>> arg.probability = nla_get_u32(probability);
>>>>>
>>>>> err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_ARG, &arg, sizeof(arg),
>>>>> @@ -2616,10 +2641,17 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
>>>>> if (err)
>>>>> return err;
>>>>>
>>>>> - err = __ovs_nla_copy_actions(net, actions, key, sfa,
>>>>> - eth_type, vlan_tci, mpls_label_count, log,
>>>>> - depth + 1);
>>>>> + if (psample)
>>>>> + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PSAMPLE,
>>>>> + nla_data(psample), nla_len(psample),
>>>>> + log);
>>>>> + if (err)
>>>>
>>>> Can be used uninitialized.
>>>>
>>>
>>> You're right, it should be inside the if above, although err should have been
>>> initialized to output of ovs_nla_add_action.
>>
>> Ah, I missed the previous assingnent. But yes, it is still a little strange
>> to handle errors this way.
>>
>>>
>>>>> + return err;
>>>>>
>>>>> + if (actions)
>>>>> + err = __ovs_nla_copy_actions(net, actions, key, sfa,
>>>>> + eth_type, vlan_tci,
>>>>> + mpls_label_count, log, depth + 1);
>>>>> if (err)
>>>>> return err;
>>>>>
>>>>> @@ -3538,7 +3570,7 @@ static int sample_action_to_attr(const struct nlattr *attr,
>>>>> struct nlattr *start, *ac_start = NULL, *sample_arg;
>>>>> int err = 0, rem = nla_len(attr);
>>>>> const struct sample_arg *arg;
>>>>> - struct nlattr *actions;
>>>>> + struct nlattr *next;
>>>>>
>>>>> start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SAMPLE);
>>>>> if (!start)
>>>>> @@ -3546,27 +3578,39 @@ static int sample_action_to_attr(const struct nlattr *attr,
>>>>>
>>>>> sample_arg = nla_data(attr);
>>>>> arg = nla_data(sample_arg);
>>>>> - actions = nla_next(sample_arg, &rem);
>>>>> + next = nla_next(sample_arg, &rem);
>>>>>
>>>>> if (nla_put_u32(skb, OVS_SAMPLE_ATTR_PROBABILITY, arg->probability)) {
>>>>> err = -EMSGSIZE;
>>>>> goto out;
>>>>> }
>>>>>
>>>>> - ac_start = nla_nest_start_noflag(skb, OVS_SAMPLE_ATTR_ACTIONS);
>>>>> - if (!ac_start) {
>>>>> - err = -EMSGSIZE;
>>>>> - goto out;
>>>>> + if (nla_type(next) == OVS_SAMPLE_ATTR_PSAMPLE) {
>>>>> + if (nla_put(skb, OVS_SAMPLE_ATTR_PSAMPLE, nla_len(next),
>>>>> + nla_data(next))) {
>>>>> + err = -EMSGSIZE;
>>>>> + goto out;
>>>>> + }
>>>>> + next = nla_next(next, &rem);
>>>>> }
>>>>>
>>>>> - err = ovs_nla_put_actions(actions, rem, skb);
>>>>> + if (nla_ok(next, rem)) {
>>>>> + ac_start = nla_nest_start_noflag(skb, OVS_SAMPLE_ATTR_ACTIONS);
>>>>> + if (!ac_start) {
>>>>> + err = -EMSGSIZE;
>>>>> + goto out;
>>>>> + }
>>>>> + err = ovs_nla_put_actions(next, rem, skb);
>>>>> + }
>>>>>
>>>>> out:
>>>>> if (err) {
>>>>> - nla_nest_cancel(skb, ac_start);
>>>>> + if (ac_start)
>>>>> + nla_nest_cancel(skb, ac_start);
>>>>> nla_nest_cancel(skb, start);
>>>>> } else {
>>>>> - nla_nest_end(skb, ac_start);
>>>>> + if (ac_start)
>>>>> + nla_nest_end(skb, ac_start);
>>>>> nla_nest_end(skb, start);
>>>>> }
>>>>>
>>>>
>>>
>>
>
Powered by blists - more mailing lists