Message-ID: <874l3w26lb.fsf@toke.dk>
Date: Mon, 08 Jul 2019 11:55:44 +0200
From: Toke Høiland-Jørgensen <toke@...hat.com>
To: Y Song <ys114321@...il.com>
Cc: Daniel Borkmann <daniel@...earbox.net>,
Alexei Starovoitov <ast@...nel.org>,
netdev <netdev@...r.kernel.org>,
David Miller <davem@...emloft.net>,
Jesper Dangaard Brouer <brouer@...hat.com>,
Jakub Kicinski <jakub.kicinski@...ronome.com>,
Björn Töpel <bjorn.topel@...il.com>
Subject: Re: [PATCH bpf-next v2 4/6] xdp: Add devmap_hash map type for looking up devices by hashed index
Y Song <ys114321@...il.com> writes:
> On Sat, Jul 6, 2019 at 1:48 AM Toke Høiland-Jørgensen <toke@...hat.com> wrote:
>>
>> From: Toke Høiland-Jørgensen <toke@...hat.com>
>>
>> A common pattern when using xdp_redirect_map() is to create a device map
>> where the lookup key is simply ifindex. Because device maps are arrays,
>> this leaves holes in the map, and the map has to be sized to fit the
>> largest ifindex, regardless of how many devices are actually
>> needed in the map.
>>
>> This patch adds a second type of device map where the key is looked up
>> using a hashmap, instead of being used as an array index. This allows maps
>> to be densely packed, so they can be smaller.
>>
>> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
>> ---
>>  include/linux/bpf.h        |    7 ++
>>  include/linux/bpf_types.h  |    1 
>>  include/trace/events/xdp.h |    3 -
>>  kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
>>  kernel/bpf/verifier.c      |    2 
>>  net/core/filter.c          |    9 ++
>>  6 files changed, 211 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index bfdb54dd2ad1..f9a506147c8a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -713,6 +713,7 @@ struct xdp_buff;
>> struct sk_buff;
>>
>> struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
>> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
>> void __dev_map_flush(struct bpf_map *map);
>> int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>> struct net_device *dev_rx);
>> @@ -799,6 +800,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
>> return NULL;
>> }
>>
>> +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map,
>> + u32 key)
>> +{
>> + return NULL;
>> +}
>> +
>> static inline void __dev_map_flush(struct bpf_map *map)
>> {
>> }
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index eec5aeeeaf92..36a9c2325176 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
>> BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
>> #ifdef CONFIG_NET
>> BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
>> +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
>> BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
>> #if defined(CONFIG_BPF_STREAM_PARSER)
>> BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
>> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
>> index 68899fdc985b..8c8420230a10 100644
>> --- a/include/trace/events/xdp.h
>> +++ b/include/trace/events/xdp.h
>> @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
>> #endif /* __DEVMAP_OBJ_TYPE */
>>
>> #define devmap_ifindex(fwd, map) \
>> - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
>> + ((map->map_type == BPF_MAP_TYPE_DEVMAP || \
>> + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \
>> ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
>>
>> #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> index a2fe16362129..341af02f049d 100644
>> --- a/kernel/bpf/devmap.c
>> +++ b/kernel/bpf/devmap.c
>> @@ -37,6 +37,12 @@
>> * notifier hook walks the map we know that new dev references can not be
>> * added by the user because core infrastructure ensures dev_get_by_index()
>> * calls will fail at this point.
>> + *
>> + * The devmap_hash type is a map type which interprets keys as ifindexes and
>> + * indexes these using a hashmap. This allows maps that use ifindex as key to be
>> + * densely packed instead of having holes in the lookup array for unused
>> + * ifindexes. The setup and packet enqueue/send code is shared between the two
>> + types of devmap; only the lookup and insertion are different.
>> */
>> #include <linux/bpf.h>
>> #include <net/xdp.h>
>> @@ -59,6 +65,7 @@ struct xdp_bulk_queue {
>>
>> struct bpf_dtab_netdev {
>> struct net_device *dev; /* must be first member, due to tracepoint */
>> + struct hlist_node index_hlist;
>> struct bpf_dtab *dtab;
>> unsigned int idx; /* keep track of map index for tracepoint */
>> struct xdp_bulk_queue __percpu *bulkq;
>> @@ -70,11 +77,29 @@ struct bpf_dtab {
>> struct bpf_dtab_netdev **netdev_map;
>> struct list_head __percpu *flush_list;
>> struct list_head list;
>> +
>> + /* these are only used for DEVMAP_HASH type maps */
>> + unsigned int items;
>> + struct hlist_head *dev_index_head;
>> + spinlock_t index_lock;
>> };
>>
>> static DEFINE_SPINLOCK(dev_map_lock);
>> static LIST_HEAD(dev_map_list);
>>
>> +static struct hlist_head *dev_map_create_hash(void)
>> +{
>> + int i;
>> + struct hlist_head *hash;
>> +
>> + hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
>> + if (hash != NULL)
>> + for (i = 0; i < NETDEV_HASHENTRIES; i++)
>> + INIT_HLIST_HEAD(&hash[i]);
>> +
>> + return hash;
>> +}
>> +
>> static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>> bool check_memlock)
>> {
>> @@ -98,6 +123,9 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>> cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>> cost += sizeof(struct list_head) * num_possible_cpus();
>>
>> + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
>> + cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
>> +
>> /* if map size is larger than memlock limit, reject it */
>> err = bpf_map_charge_init(&dtab->map.memory, cost);
>> if (err)
>> @@ -116,8 +144,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>> if (!dtab->netdev_map)
>> goto free_percpu;
>>
>> + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
>> + dtab->dev_index_head = dev_map_create_hash();
>> + if (!dtab->dev_index_head)
>> + goto free_map_area;
>> +
>> + spin_lock_init(&dtab->index_lock);
>> + }
>> +
>> return 0;
>>
>> +free_map_area:
>> + bpf_map_area_free(dtab->netdev_map);
>> free_percpu:
>> free_percpu(dtab->flush_list);
>> free_charge:
>> @@ -199,6 +237,7 @@ static void dev_map_free(struct bpf_map *map)
>>
>> free_percpu(dtab->flush_list);
>> bpf_map_area_free(dtab->netdev_map);
>> + kfree(dtab->dev_index_head);
>> kfree(dtab);
>> }
>>
>> @@ -219,6 +258,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>> return 0;
>> }
>>
>> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
>> + int idx)
>> +{
>> + return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
>> +}
>> +
>> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
>> +{
>> + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> + struct hlist_head *head = dev_map_index_hash(dtab, key);
>> + struct bpf_dtab_netdev *dev;
>> +
>> + hlist_for_each_entry_rcu(dev, head, index_hlist)
>> + if (dev->idx == key)
>> + return dev;
>> +
>> + return NULL;
>> +}
>> +
>> +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>> + void *next_key)
>> +{
>> + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> + u32 idx, *next = next_key;
>> + struct bpf_dtab_netdev *dev, *next_dev;
>> + struct hlist_head *head;
>> + int i = 0;
>> +
>> + if (!key)
>> + goto find_first;
>> +
>> + idx = *(u32 *)key;
>> +
>> + dev = __dev_map_hash_lookup_elem(map, idx);
>> + if (!dev)
>> + goto find_first;
>> +
>> + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
>> + struct bpf_dtab_netdev, index_hlist);
>
> Just want to get a better understanding. Why do you want
> hlist_entry_safe instead of hlist_entry?
Erm, because the entry might not exist? The _safe variant just checks
the list pointer before casting to the containing struct.
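
For reference, the macro is (roughly, from include/linux/list.h):

	#define hlist_entry_safe(ptr, type, member) \
		({ typeof(ptr) ____ptr = (ptr); \
		   ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
		})

so when the RCU-dereferenced next pointer is NULL (i.e., we're at the
end of the hash chain), we get NULL back instead of a bogus container
pointer.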
> Also, maybe rcu_dereference instead of rcu_dereference_raw?
Well, the _raw() variant comes from the equivalent function in
hashtab.c, where I originally nicked most of this function, so I think
it makes more sense to keep them the same.
> dev_map_hash_get_next_key() is called in syscall.c within rcu_read_lock region.
>
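Right, and that is also why either variant works here. As far as I can
tell the only difference is the lockdep annotation: rcu_dereference(p)
expands to rcu_dereference_check(p, 0), which complains unless
rcu_read_lock_held(), whereas the _raw() variant does the same
READ_ONCE() load (with dependency ordering) but skips the check. The
call site in kernel/bpf/syscall.c is roughly:

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
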
>> +
>> + if (next_dev) {
>> + *next = next_dev->idx;
>> + return 0;
>> + }
>> +
>> + i = idx & (NETDEV_HASHENTRIES - 1);
>> + i++;
>> +
>> + find_first:
>> + for (; i < NETDEV_HASHENTRIES; i++) {
>> + head = dev_map_index_hash(dtab, i);
>> +
>> + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
>> + struct bpf_dtab_netdev,
>> + index_hlist);
>
> Ditto, the same question as above.
>
>> + if (next_dev) {
>> + *next = next_dev->idx;
>> + return 0;
>> + }
>> + }
>> +
>> + return -ENOENT;
>> +}
>> +
>> static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
>> bool in_napi_ctx)
>> {
>> @@ -374,6 +477,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
>> return dev ? &dev->ifindex : NULL;
>> }
>>
>> +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
>> +{
>> + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
>> + *(u32 *)key);
>> + struct net_device *dev = obj ? obj->dev : NULL;
>
> When could obj->dev be NULL?
Never. This could just as well be written as

	return obj ? &obj->dev->ifindex : NULL;

but writing it this way keeps it consistent with the existing
dev_map_lookup_elem() function.
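
For comparison, the existing array-based function is (from memory,
modulo surrounding context):

	static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
	{
		struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map,
								    *(u32 *)key);
		struct net_device *dev = obj ? obj->dev : NULL;

		return dev ? &dev->ifindex : NULL;
	}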
>> +
>> + return dev ? &dev->ifindex : NULL;
>> +}
>> +
>> static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>> {
>> if (dev->dev->netdev_ops->ndo_xdp_xmit) {
>> @@ -423,6 +535,27 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
>> return 0;
>> }
>>
>> +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
>> +{
>> + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> + struct bpf_dtab_netdev *old_dev;
>> + int k = *(u32 *)key;
>> +
>> + old_dev = __dev_map_hash_lookup_elem(map, k);
>> + if (!old_dev)
>> + return 0;
>> +
>> + spin_lock(&dtab->index_lock);
>> + if (!hlist_unhashed(&old_dev->index_hlist)) {
>> + dtab->items--;
>> + hlist_del_init_rcu(&old_dev->index_hlist);
>> + }
>> + spin_unlock(&dtab->index_lock);
>> +
>> + call_rcu(&old_dev->rcu, __dev_map_entry_free);
>> + return 0;
>> +}
>> +
>> static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
>> struct bpf_dtab *dtab,
>> u32 ifindex,
>> @@ -503,6 +636,55 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>> map, key, value, map_flags);
>> }
>>
>> +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
>> + void *key, void *value, u64 map_flags)
>> +{
>> + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> + struct bpf_dtab_netdev *dev, *old_dev;
>> + u32 ifindex = *(u32 *)value;
>> + u32 idx = *(u32 *)key;
>> +
>> + if (unlikely(map_flags > BPF_EXIST || !ifindex))
>> + return -EINVAL;
>> +
>> + old_dev = __dev_map_hash_lookup_elem(map, idx);
>> + if (old_dev && (map_flags & BPF_NOEXIST))
>> + return -EEXIST;
>> +
>> + dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
>> + if (IS_ERR(dev))
>> + return PTR_ERR(dev);
>> +
>> + spin_lock(&dtab->index_lock);
>> +
>> + if (old_dev) {
>> + hlist_del_rcu(&old_dev->index_hlist);
>> + } else {
>> + if (dtab->items >= dtab->map.max_entries) {
>> + spin_unlock(&dtab->index_lock);
>> + call_rcu(&dev->rcu, __dev_map_entry_free);
>> + return -ENOSPC;
>> + }
>> + dtab->items++;
>> + }
>> +
>> + hlist_add_head_rcu(&dev->index_hlist,
>> + dev_map_index_hash(dtab, idx));
>> + spin_unlock(&dtab->index_lock);
>> +
>> + if (old_dev)
>> + call_rcu(&old_dev->rcu, __dev_map_entry_free);
>> +
>> + return 0;
>> +}
>> +
>> +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
>> + u64 map_flags)
>> +{
>> + return __dev_map_hash_update_elem(current->nsproxy->net_ns,
>> + map, key, value, map_flags);
>> +}
>> +
>> const struct bpf_map_ops dev_map_ops = {
>> .map_alloc = dev_map_alloc,
>> .map_free = dev_map_free,
>> @@ -513,6 +695,16 @@ const struct bpf_map_ops dev_map_ops = {
>> .map_check_btf = map_check_no_btf,
>> };
>>
>> +const struct bpf_map_ops dev_map_hash_ops = {
>> + .map_alloc = dev_map_alloc,
>> + .map_free = dev_map_free,
>> + .map_get_next_key = dev_map_hash_get_next_key,
>> + .map_lookup_elem = dev_map_hash_lookup_elem,
>> + .map_update_elem = dev_map_hash_update_elem,
>> + .map_delete_elem = dev_map_hash_delete_elem,
>> + .map_check_btf = map_check_no_btf,
>> +};
>> +
>> static int dev_map_notification(struct notifier_block *notifier,
>> ulong event, void *ptr)
>> {
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index a2e763703c30..231b9e22827c 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -3460,6 +3460,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>> goto error;
>> break;
>> case BPF_MAP_TYPE_DEVMAP:
>> + case BPF_MAP_TYPE_DEVMAP_HASH:
>> if (func_id != BPF_FUNC_redirect_map &&
>> func_id != BPF_FUNC_map_lookup_elem)
>> goto error;
>> @@ -3542,6 +3543,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>> break;
>> case BPF_FUNC_redirect_map:
>> if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
>> + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
>> map->map_type != BPF_MAP_TYPE_CPUMAP &&
>> map->map_type != BPF_MAP_TYPE_XSKMAP)
>> goto error;
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 089aaea0ccc6..276961c4e0d4 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>> int err;
>>
>> switch (map->map_type) {
>> - case BPF_MAP_TYPE_DEVMAP: {
>> + case BPF_MAP_TYPE_DEVMAP:
>> + case BPF_MAP_TYPE_DEVMAP_HASH: {
>> struct bpf_dtab_netdev *dst = fwd;
>>
>> err = dev_map_enqueue(dst, xdp, dev_rx);
>> @@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void)
>> if (map) {
>> switch (map->map_type) {
>> case BPF_MAP_TYPE_DEVMAP:
>> + case BPF_MAP_TYPE_DEVMAP_HASH:
>> __dev_map_flush(map);
>> break;
>> case BPF_MAP_TYPE_CPUMAP:
>> @@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
>> switch (map->map_type) {
>> case BPF_MAP_TYPE_DEVMAP:
>> return __dev_map_lookup_elem(map, index);
>> + case BPF_MAP_TYPE_DEVMAP_HASH:
>> + return __dev_map_hash_lookup_elem(map, index);
>> case BPF_MAP_TYPE_CPUMAP:
>> return __cpu_map_lookup_elem(map, index);
>> case BPF_MAP_TYPE_XSKMAP:
>> @@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
>> ri->tgt_value = NULL;
>> WRITE_ONCE(ri->map, NULL);
>>
>> - if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
>> + if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
>> + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
>> struct bpf_dtab_netdev *dst = fwd;
>>
>> err = dev_map_generic_redirect(dst, skb, xdp_prog);
>>
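In case anyone wants to see what usage looks like from the BPF program
side, here is a minimal (untested) sketch using the classic libbpf map
definition macros; the map name and sizes are just made up for the
example:

	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	struct bpf_map_def SEC("maps") tx_port = {
		.type		= BPF_MAP_TYPE_DEVMAP_HASH,
		.key_size	= sizeof(__u32),	/* ifindex used as key */
		.value_size	= sizeof(__u32),	/* target device ifindex */
		.max_entries	= 64,	/* number of devices, not max ifindex */
	};

	SEC("xdp")
	int xdp_redirect_hash(struct xdp_md *ctx)
	{
		/* Look up the egress device keyed by the ingress ifindex;
		 * userspace populates map[ingress_ifindex] = egress_ifindex.
		 * With the hash type the map only has to be sized for the
		 * number of devices, not the largest ifindex on the system.
		 */
		return bpf_redirect_map(&tx_port, ctx->ingress_ifindex, 0);
	}

	char _license[] SEC("license") = "GPL";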