Date:   Mon, 8 Jul 2019 08:01:05 -0700
From:   Y Song <ys114321@...il.com>
To:     Toke Høiland-Jørgensen <toke@...hat.com>
Cc:     Daniel Borkmann <daniel@...earbox.net>,
        Alexei Starovoitov <ast@...nel.org>,
        netdev <netdev@...r.kernel.org>,
        David Miller <davem@...emloft.net>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Jakub Kicinski <jakub.kicinski@...ronome.com>,
        Björn Töpel <bjorn.topel@...il.com>
Subject: Re: [PATCH bpf-next v2 4/6] xdp: Add devmap_hash map type for looking
 up devices by hashed index

On Mon, Jul 8, 2019 at 2:55 AM Toke Høiland-Jørgensen <toke@...hat.com> wrote:
>
> Y Song <ys114321@...il.com> writes:
>
> > On Sat, Jul 6, 2019 at 1:48 AM Toke Høiland-Jørgensen <toke@...hat.com> wrote:
> >>
> >> From: Toke Høiland-Jørgensen <toke@...hat.com>
> >>
> >> A common pattern when using xdp_redirect_map() is to create a device map
> >> where the lookup key is simply ifindex. Because device maps are arrays,
> >> this leaves holes in the map, and the map has to be sized to fit the
> >> largest ifindex, regardless of how many devices are actually
> >> needed in the map.
> >>
> >> This patch adds a second type of device map where the key is looked up
> >> using a hashmap, instead of being used as an array index. This allows maps
> >> to be densely packed, so they can be smaller.
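
Side note for readers following the series: from the BPF program side, a
devmap_hash is used exactly like a regular devmap; only the sizing
changes. A rough sketch (not part of this patch, and assuming the
samples/bpf-style map definitions and helper declarations from
"bpf_helpers.h"):

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    /* Keys are ifindexes, so max_entries only needs to cover the
     * number of devices actually inserted, not the largest ifindex
     * on the system.
     */
    struct bpf_map_def SEC("maps") tx_ports = {
            .type        = BPF_MAP_TYPE_DEVMAP_HASH,
            .key_size    = sizeof(__u32),
            .value_size  = sizeof(__u32),
            .max_entries = 8,
    };

    SEC("xdp")
    int xdp_redirect_hash(struct xdp_md *ctx)
    {
            /* Look up the ingress ifindex in the hash and redirect
             * the packet back out that device.
             */
            return bpf_redirect_map(&tx_ports, ctx->ingress_ifindex, 0);
    }
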
> >>
> >> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
> >> ---
> >>  include/linux/bpf.h        |    7 ++
> >>  include/linux/bpf_types.h  |    1
> >>  include/trace/events/xdp.h |    3 -
> >>  kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
> >>  kernel/bpf/verifier.c      |    2
> >>  net/core/filter.c          |    9 ++
> >>  6 files changed, 211 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index bfdb54dd2ad1..f9a506147c8a 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -713,6 +713,7 @@ struct xdp_buff;
> >>  struct sk_buff;
> >>
> >>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> >> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
> >>  void __dev_map_flush(struct bpf_map *map);
> >>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> >>                     struct net_device *dev_rx);
> >> @@ -799,6 +800,12 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
> >>         return NULL;
> >>  }
> >>
> >> +static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
> >> +                                                            u32 key)
> >> +{
> >> +       return NULL;
> >> +}
> >> +
> >>  static inline void __dev_map_flush(struct bpf_map *map)
> >>  {
> >>  }
> >> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> >> index eec5aeeeaf92..36a9c2325176 100644
> >> --- a/include/linux/bpf_types.h
> >> +++ b/include/linux/bpf_types.h
> >> @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
> >>  #ifdef CONFIG_NET
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
> >> +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
> >>  #if defined(CONFIG_BPF_STREAM_PARSER)
> >>  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
> >> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> >> index 68899fdc985b..8c8420230a10 100644
> >> --- a/include/trace/events/xdp.h
> >> +++ b/include/trace/events/xdp.h
> >> @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
> >>  #endif /* __DEVMAP_OBJ_TYPE */
> >>
> >>  #define devmap_ifindex(fwd, map)                               \
> >> -       ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?               \
> >> +       ((map->map_type == BPF_MAP_TYPE_DEVMAP ||               \
> >> +         map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?          \
> >>           ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
> >>
> >>  #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)               \
> >> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> >> index a2fe16362129..341af02f049d 100644
> >> --- a/kernel/bpf/devmap.c
> >> +++ b/kernel/bpf/devmap.c
> >> @@ -37,6 +37,12 @@
> >>   * notifier hook walks the map we know that new dev references can not be
> >>   * added by the user because core infrastructure ensures dev_get_by_index()
> >>   * calls will fail at this point.
> >> + *
> >> + * The devmap_hash type is a map type which interprets keys as ifindexes and
> >> + * indexes these using a hashmap. This allows maps that use ifindex as key to be
> >> + * densely packed instead of having holes in the lookup array for unused
> >> + * ifindexes. The setup and packet enqueue/send code is shared between the two
> >> + * types of devmap; only the lookup and insertion is different.
> >>   */
> >>  #include <linux/bpf.h>
> >>  #include <net/xdp.h>
> >> @@ -59,6 +65,7 @@ struct xdp_bulk_queue {
> >>
> >>  struct bpf_dtab_netdev {
> >>         struct net_device *dev; /* must be first member, due to tracepoint */
> >> +       struct hlist_node index_hlist;
> >>         struct bpf_dtab *dtab;
> >>         unsigned int idx; /* keep track of map index for tracepoint */
> >>         struct xdp_bulk_queue __percpu *bulkq;
> >> @@ -70,11 +77,29 @@ struct bpf_dtab {
> >>         struct bpf_dtab_netdev **netdev_map;
> >>         struct list_head __percpu *flush_list;
> >>         struct list_head list;
> >> +
> >> +       /* these are only used for DEVMAP_HASH type maps */
> >> +       unsigned int items;
> >> +       struct hlist_head *dev_index_head;
> >> +       spinlock_t index_lock;
> >>  };
> >>
> >>  static DEFINE_SPINLOCK(dev_map_lock);
> >>  static LIST_HEAD(dev_map_list);
> >>
> >> +static struct hlist_head *dev_map_create_hash(void)
> >> +{
> >> +       int i;
> >> +       struct hlist_head *hash;
> >> +
> >> +       hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
> >> +       if (hash != NULL)
> >> +               for (i = 0; i < NETDEV_HASHENTRIES; i++)
> >> +                       INIT_HLIST_HEAD(&hash[i]);
> >> +
> >> +       return hash;
> >> +}
> >> +
> >>  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>                             bool check_memlock)
> >>  {
> >> @@ -98,6 +123,9 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
> >>         cost += sizeof(struct list_head) * num_possible_cpus();
> >>
> >> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
> >> +               cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
> >> +
> >>         /* if map size is larger than memlock limit, reject it */
> >>         err = bpf_map_charge_init(&dtab->map.memory, cost);
> >>         if (err)
> >> @@ -116,8 +144,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> >>         if (!dtab->netdev_map)
> >>                 goto free_percpu;
> >>
> >> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
> >> +               dtab->dev_index_head = dev_map_create_hash();
> >> +               if (!dtab->dev_index_head)
> >> +                       goto free_map_area;
> >> +
> >> +               spin_lock_init(&dtab->index_lock);
> >> +       }
> >> +
> >>         return 0;
> >>
> >> +free_map_area:
> >> +       bpf_map_area_free(dtab->netdev_map);
> >>  free_percpu:
> >>         free_percpu(dtab->flush_list);
> >>  free_charge:
> >> @@ -199,6 +237,7 @@ static void dev_map_free(struct bpf_map *map)
> >>
> >>         free_percpu(dtab->flush_list);
> >>         bpf_map_area_free(dtab->netdev_map);
> >> +       kfree(dtab->dev_index_head);
> >>         kfree(dtab);
> >>  }
> >>
> >> @@ -219,6 +258,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> >>         return 0;
> >>  }
> >>
> >> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
> >> +                                                   int idx)
> >> +{
> >> +       return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
> >> +}
> >> +
> >> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
> >> +{
> >> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> +       struct hlist_head *head = dev_map_index_hash(dtab, key);
> >> +       struct bpf_dtab_netdev *dev;
> >> +
> >> +       hlist_for_each_entry_rcu(dev, head, index_hlist)
> >> +               if (dev->idx == key)
> >> +                       return dev;
> >> +
> >> +       return NULL;
> >> +}
> >> +
> >> +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
> >> +                                   void *next_key)
> >> +{
> >> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> >> +       u32 idx, *next = next_key;
> >> +       struct bpf_dtab_netdev *dev, *next_dev;
> >> +       struct hlist_head *head;
> >> +       int i = 0;
> >> +
> >> +       if (!key)
> >> +               goto find_first;
> >> +
> >> +       idx = *(u32 *)key;
> >> +
> >> +       dev = __dev_map_hash_lookup_elem(map, idx);
> >> +       if (!dev)
> >> +               goto find_first;
> >> +
> >> +       next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
> >> +                                   struct bpf_dtab_netdev, index_hlist);
> >
> > Just want to get a better understanding. Why do you want
> > hlist_entry_safe instead of hlist_entry?
>
> Erm, because the entry might not exist? The _safe variant just checks
> the list pointer before casting to the containing struct.

hlist_entry_safe() is certainly correct. I just wanted to understand
better when the entry might not exist. Looking at the hashtab.c
implementation, it makes the same possible-null-entry assumption, so we
should be okay here.
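
For reference, hlist_entry_safe() in include/linux/list.h is just a
NULL check wrapped around the container_of() cast, roughly:

    #define hlist_entry_safe(ptr, type, member) \
            ({ typeof(ptr) ____ptr = (ptr); \
               ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
            })

So when dev is the last element in its bucket, hlist_next_rcu() yields
NULL and next_dev simply ends up NULL, instead of container_of() being
applied to a NULL pointer.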

>
> > Also, maybe rcu_dereference instead of rcu_dereference_raw?
>
> Well, the _raw() variant comes from the equivalent function in
> hashtab.c, where I originally nicked most of this function, so think it
> makes more sense to keep them the same.

Okay.
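
For the record, my understanding is that the only difference is the
lockdep annotation: rcu_dereference() asserts (when lockdep is enabled)
that an RCU read-side critical section is held, while
rcu_dereference_raw() performs the same READ_ONCE() load without that
check. Roughly:

    p = rcu_dereference(head->first);     /* lockdep splat if called
                                           * outside rcu_read_lock() */
    p = rcu_dereference_raw(head->first); /* same load, no lockdep check */

Since the syscall path does hold rcu_read_lock() here, both are safe;
using _raw just keeps this consistent with hashtab.c.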

>
> > dev_map_hash_get_next_key() is called in syscall.c within rcu_read_lock region.
> >
> >> +
> >> +       if (next_dev) {
> >> +               *next = next_dev->idx;
> >> +               return 0;
> >> +       }
> >> +
> >> +       i = idx & (NETDEV_HASHENTRIES - 1);
> >> +       i++;
> >> +
> >> + find_first:
> >> +       for (; i < NETDEV_HASHENTRIES; i++) {
> >> +               head = dev_map_index_hash(dtab, i);
> >> +
> >> +               next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
> >> +                                           struct bpf_dtab_netdev,
> >> +                                           index_hlist);
> >
> > ditto. The same question as the above.
> >
> >> +               if (next_dev) {
> >> +                       *next = next_dev->idx;
> >> +                       return 0;
> >> +               }
> >> +       }
> >> +
> >> +       return -ENOENT;
> >> +}
> >> +
> >>  static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
> >>                        bool in_napi_ctx)
> >>  {
> >> @@ -374,6 +477,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
> >>         return dev ? &dev->ifindex : NULL;
> >>  }
> >>
> >> +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
> >> +{
> >> +       struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
> >> +                                                               *(u32 *)key);
> >> +       struct net_device *dev = obj ? obj->dev : NULL;
> >
> > When obj->dev could be NULL?
>
> Never. This could just as well be written as
>
> return obj ? obj->dev->ifindex : NULL;
>
> but writing it this way keeps it consistent with the existing
> dev_map_lookup_elem() function.

Okay, keeping it this way to stay consistent with the existing code base
makes sense.

Thanks for all the explanation!