[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180618181123.eczjeb3axd6sao57@kafai-mbp.dhcp.thefacebook.com>
Date: Mon, 18 Jun 2018 11:11:23 -0700
From: Martin KaFai Lau <kafai@...com>
To: <dsahern@...nel.org>
CC: <netdev@...r.kernel.org>, <borkmann@...earbox.net>,
<ast@...nel.org>, <davem@...emloft.net>,
David Ahern <dsahern@...il.com>
Subject: Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup
status
On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsahern@...nel.org wrote:
> From: David Ahern <dsahern@...il.com>
>
> For ACLs implemented using either FIB rules or FIB entries, the BPF
> program needs the FIB lookup status to be able to drop the packet.
Other than BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH, can you
give an example of how the xdp_prog may decide between XDP_PASS and XDP_DROP
based on the other BPF_FIB_LKUP_RET_* codes?
> Since the bpf_fib_lookup API has not reached a released kernel yet,
> change the return code to contain an encoding of the FIB lookup
> result and return the nexthop device index in the params struct.
>
> In addition, inform the BPF program of any post FIB lookup reason as
> to why the packet needs to go up the stack.
>
> Update the sample program per the change in API.
>
> Signed-off-by: David Ahern <dsahern@...il.com>
> ---
> include/uapi/linux/bpf.h | 28 ++++++++++++++----
> net/core/filter.c | 74 ++++++++++++++++++++++++++++++++--------------
> samples/bpf/xdp_fwd_kern.c | 8 ++---
> 3 files changed, 78 insertions(+), 32 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59b19b6a40d7..ceb80071c341 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1857,7 +1857,8 @@ union bpf_attr {
> * is resolved), the nexthop address is returned in ipv4_dst
> * or ipv6_dst based on family, smac is set to mac address of
> * egress device, dmac is set to nexthop mac address, rt_metric
> - * is set to metric from route (IPv4/IPv6 only).
> + * is set to metric from route (IPv4/IPv6 only), and ifindex
> + * is set to the device index of the nexthop from the FIB lookup.
> *
> * *plen* argument is the size of the passed in struct.
> * *flags* argument can be a combination of one or more of the
> @@ -1873,9 +1874,9 @@ union bpf_attr {
> * *ctx* is either **struct xdp_md** for XDP programs or
> * **struct sk_buff** tc cls_act programs.
> * Return
> - * Egress device index on success, 0 if packet needs to continue
> - * up the stack for further processing or a negative error in case
> - * of failure.
> + * < 0 if any input argument is invalid
> + * 0 on success (packet is forwarded and nexthop neighbor exists)
> + * > 0 one of BPF_FIB_LKUP_RET_ codes on FIB lookup response
> *
> * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
> * Description
> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
> #define BPF_FIB_LOOKUP_DIRECT BIT(0)
> #define BPF_FIB_LOOKUP_OUTPUT BIT(1)
>
> +enum {
> + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */
> + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed */
> + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable */
> + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed */
> + BPF_FIB_LKUP_RET_NOT_FWDED, /* pkt is not forwarded */
Is BPF_FIB_LKUP_RET_NOT_FWDED meant to be a catch-all?
> + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
> + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires unsupported encap */
> + BPF_FIB_LKUP_RET_NO_NHDEV, /* nh device does not exist */
> + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neigh entry for nh */
> + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* pkt too big to fwd */
> +};
> +
> struct bpf_fib_lookup {
> /* input: network family for lookup (AF_INET, AF_INET6)
> * output: network family of egress nexthop
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
>
> /* total length of packet from network header - used for MTU check */
> __u16 tot_len;
> - __u32 ifindex; /* L3 device index for lookup */
> +
> + /* input: L3 device index for lookup
> + * output: nexthop device index from FIB lookup
> + */
> + __u32 ifindex;
>
> union {
> /* inputs to lookup */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e7f12e9f598c..e758ca487878 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> params->h_vlan_TCI = 0;
> params->h_vlan_proto = 0;
> + params->ifindex = dev->ifindex;
>
> - return dev->ifindex;
> + return 0;
> }
> #endif
>
> @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> /* verify forwarding is enabled on this interface */
> in_dev = __in_dev_get_rcu(dev);
> if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> - return 0;
> + return BPF_FIB_LKUP_RET_FWD_DISABLED;
>
> if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> fl4.flowi4_iif = 1;
> @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>
> tb = fib_get_table(net, tbid);
> if (unlikely(!tb))
> - return 0;
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
>
> err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> } else {
> @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> }
>
> - if (err || res.type != RTN_UNICAST)
> - return 0;
> + if (err) {
> + /* map fib lookup errors to RTN_ type */
> + if (err == -EINVAL)
> + return BPF_FIB_LKUP_RET_BLACKHOLE;
> + if (err == -EHOSTUNREACH)
> + return BPF_FIB_LKUP_RET_UNREACHABLE;
> + if (err == -EACCES)
> + return BPF_FIB_LKUP_RET_PROHIBIT;
> +
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
> + }
> +
> + if (res.type != RTN_UNICAST)
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
>
> if (res.fi->fib_nhs > 1)
> fib_select_path(net, &res, &fl4, NULL);
> @@ -4144,18 +4157,18 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> if (check_mtu) {
> mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
> if (params->tot_len > mtu)
> - return 0;
> + return BPF_FIB_LKUP_RET_FRAG_NEEDED;
> }
>
> nh = &res.fi->fib_nh[res.nh_sel];
>
> /* do not handle lwt encaps right now */
> if (nh->nh_lwtstate)
> - return 0;
> + return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>
> dev = nh->nh_dev;
> if (unlikely(!dev))
> - return 0;
> + return BPF_FIB_LKUP_RET_NO_NHDEV;
>
> if (nh->nh_gw)
> params->ipv4_dst = nh->nh_gw;
> @@ -4166,10 +4179,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> * rcu_read_lock_bh is not needed here
> */
> neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> - if (neigh)
> - return bpf_fib_set_fwd_params(params, neigh, dev);
> + if (!neigh)
> + return BPF_FIB_LKUP_RET_NO_NEIGH;
>
> - return 0;
> + return bpf_fib_set_fwd_params(params, neigh, dev);
> }
> #endif
>
> @@ -4190,7 +4203,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>
> /* link local addresses are never forwarded */
> if (rt6_need_strict(dst) || rt6_need_strict(src))
> - return 0;
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
>
> dev = dev_get_by_index_rcu(net, params->ifindex);
> if (unlikely(!dev))
> @@ -4198,7 +4211,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>
> idev = __in6_dev_get_safely(dev);
> if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
> - return 0;
> + return BPF_FIB_LKUP_RET_FWD_DISABLED;
>
> if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> fl6.flowi6_iif = 1;
> @@ -4225,7 +4238,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>
> tb = ipv6_stub->fib6_get_table(net, tbid);
> if (unlikely(!tb))
> - return 0;
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
>
> f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> } else {
> @@ -4238,11 +4251,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> }
>
> if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> - return 0;
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
> +
> + if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
> + switch (f6i->fib6_type) {
> + case RTN_BLACKHOLE:
> + return BPF_FIB_LKUP_RET_BLACKHOLE;
> + case RTN_UNREACHABLE:
> + return BPF_FIB_LKUP_RET_UNREACHABLE;
> + case RTN_PROHIBIT:
> + return BPF_FIB_LKUP_RET_PROHIBIT;
> + default:
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
> + }
> + }
>
> - if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> - f6i->fib6_type != RTN_UNICAST))
> - return 0;
> + if (f6i->fib6_type != RTN_UNICAST)
> + return BPF_FIB_LKUP_RET_NOT_FWDED;
>
> if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> if (check_mtu) {
> mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
> if (params->tot_len > mtu)
> - return 0;
> + return BPF_FIB_LKUP_RET_FRAG_NEEDED;
> }
>
> if (f6i->fib6_nh.nh_lwtstate)
> - return 0;
> + return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>
> if (f6i->fib6_flags & RTF_GATEWAY)
> *dst = f6i->fib6_nh.nh_gw;
>
> dev = f6i->fib6_nh.nh_dev;
> + if (unlikely(!dev))
> + return BPF_FIB_LKUP_RET_NO_NHDEV;
Is this a bug fix?
> +
> params->rt_metric = f6i->fib6_metric;
>
> /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> @@ -4270,10 +4298,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> */
> neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> ndisc_hashfn, dst, dev);
> - if (neigh)
> - return bpf_fib_set_fwd_params(params, neigh, dev);
> + if (!neigh)
> + return BPF_FIB_LKUP_RET_NO_NEIGH;
>
> - return 0;
> + return bpf_fib_set_fwd_params(params, neigh, dev);
> }
> #endif
>
> diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
> index 6673cdb9f55c..a7e94e7ff87d 100644
> --- a/samples/bpf/xdp_fwd_kern.c
> +++ b/samples/bpf/xdp_fwd_kern.c
> @@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
> struct ethhdr *eth = data;
> struct ipv6hdr *ip6h;
> struct iphdr *iph;
> - int out_index;
> u16 h_proto;
> u64 nh_off;
> + int rc;
>
> nh_off = sizeof(*eth);
> if (data + nh_off > data_end)
> @@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>
> fib_params.ifindex = ctx->ingress_ifindex;
>
> - out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
> + rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
>
> /* verify egress index has xdp support
> * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
> @@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
> * NOTE: without verification that egress index supports XDP
> * forwarding packets are dropped.
> */
> - if (out_index > 0) {
> + if (rc == 0) {
> if (h_proto == htons(ETH_P_IP))
> ip_decrease_ttl(iph);
> else if (h_proto == htons(ETH_P_IPV6))
> @@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>
> memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
> memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
> - return bpf_redirect_map(&tx_port, out_index, 0);
> + return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
> }
>
> return XDP_PASS;
> --
> 2.11.0
>
Powered by blists - more mailing lists