Message-ID: <CAH3MdRU+FXDMdQr9Q-BP8eiMLwaV6Mn2oic3kXwK4wxvw+tJAQ@mail.gmail.com>
Date: Sat, 6 Jul 2019 12:14:05 -0700
From: Y Song <ys114321@...il.com>
To: Toke Høiland-Jørgensen <toke@...hat.com>
Cc: Daniel Borkmann <daniel@...earbox.net>,
Alexei Starovoitov <ast@...nel.org>,
netdev <netdev@...r.kernel.org>,
David Miller <davem@...emloft.net>,
Jesper Dangaard Brouer <brouer@...hat.com>,
Jakub Kicinski <jakub.kicinski@...ronome.com>,
Björn Töpel <bjorn.topel@...il.com>
Subject: Re: [PATCH bpf-next v2 4/6] xdp: Add devmap_hash map type for looking
up devices by hashed index
On Sat, Jul 6, 2019 at 1:48 AM Toke Høiland-Jørgensen <toke@...hat.com> wrote:
>
> From: Toke Høiland-Jørgensen <toke@...hat.com>
>
> A common pattern when using xdp_redirect_map() is to create a device map
> where the lookup key is simply ifindex. Because device maps are arrays,
> this leaves holes in the map, and the map has to be sized to fit the
> largest ifindex, regardless of how many devices are actually needed in
> the map.
>
> This patch adds a second type of device map where the key is looked up
> using a hashmap, instead of being used as an array index. This allows maps
> to be densely packed, so they can be smaller.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
> ---
>  include/linux/bpf.h        |    7 ++
>  include/linux/bpf_types.h  |    1
>  include/trace/events/xdp.h |    3 -
>  kernel/bpf/devmap.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/verifier.c      |    2
>  net/core/filter.c          |    9 ++
>  6 files changed, 211 insertions(+), 3 deletions(-)
>
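To make the intended usage concrete for readers following along, the
BPF-program side would look roughly like this (a minimal sketch, not
taken from the patch; the map and program names are made up, and it
assumes the BPF_MAP_TYPE_DEVMAP_HASH uapi enum added elsewhere in this
series plus the bpf_helpers.h used by the selftests):

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    /* Keyed by ifindex, so max_entries only needs to cover the number
     * of devices actually used, not the largest ifindex in the system.
     */
    struct bpf_map_def SEC("maps") tx_port = {
            .type = BPF_MAP_TYPE_DEVMAP_HASH,
            .key_size = sizeof(__u32),
            .value_size = sizeof(__u32),
            .max_entries = 8,
    };

    SEC("xdp")
    int xdp_redirect_hash(struct xdp_md *ctx)
    {
            __u32 key = ctx->ingress_ifindex;

            /* Hashed lookup by ifindex, then redirect to that device. */
            return bpf_redirect_map(&tx_port, key, 0);
    }
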
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index bfdb54dd2ad1..f9a506147c8a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -713,6 +713,7 @@ struct xdp_buff;
> struct sk_buff;
>
> struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
> void __dev_map_flush(struct bpf_map *map);
> int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>                     struct net_device *dev_rx);
> @@ -799,6 +800,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
>         return NULL;
> }
>
> +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map,
> +                                                            u32 key)
> +{
> +        return NULL;
> +}
> +
> static inline void __dev_map_flush(struct bpf_map *map)
> {
> }
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index eec5aeeeaf92..36a9c2325176 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
> BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
> #ifdef CONFIG_NET
> BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
> BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
> #if defined(CONFIG_BPF_STREAM_PARSER)
> BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index 68899fdc985b..8c8420230a10 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
> #endif /* __DEVMAP_OBJ_TYPE */
>
> #define devmap_ifindex(fwd, map) \
> -        ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
> +        ((map->map_type == BPF_MAP_TYPE_DEVMAP || \
> +          map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \
>           ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
>
> #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index a2fe16362129..341af02f049d 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -37,6 +37,12 @@
> * notifier hook walks the map we know that new dev references can not be
> * added by the user because core infrastructure ensures dev_get_by_index()
> * calls will fail at this point.
> + *
> + * The devmap_hash type is a map type which interprets keys as ifindexes and
> + * indexes these using a hashmap. This allows maps that use ifindex as key to be
> + * densely packed instead of having holes in the lookup array for unused
> + * ifindexes. The setup and packet enqueue/send code is shared between the two
> + * types of devmap; only the lookup and insertion differ.
> */
> #include <linux/bpf.h>
> #include <net/xdp.h>
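
(To make the "densely packed" point above concrete for other readers --
a userspace sketch, not part of the patch; it assumes libbpf's
bpf_create_map() and bpf_map_update_elem() wrappers and two arbitrary
interface names:)

    #include <linux/bpf.h>
    #include <bpf/bpf.h>
    #include <net/if.h>

    static int setup_tx_port_map(void)
    {
            /* Two entries suffice, no matter how large the ifindexes are. */
            int fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_HASH,
                                    sizeof(__u32), sizeof(__u32),
                                    2 /* max_entries */, 0);
            __u32 ifindex;

            if (fd < 0)
                    return fd;

            /* The common pattern: key == value == ifindex. */
            ifindex = if_nametoindex("eth0");
            bpf_map_update_elem(fd, &ifindex, &ifindex, BPF_ANY);
            ifindex = if_nametoindex("eth1");
            bpf_map_update_elem(fd, &ifindex, &ifindex, BPF_ANY);

            return fd;
    }
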
> @@ -59,6 +65,7 @@ struct xdp_bulk_queue {
>
> struct bpf_dtab_netdev {
>         struct net_device *dev; /* must be first member, due to tracepoint */
> +       struct hlist_node index_hlist;
>         struct bpf_dtab *dtab;
>         unsigned int idx; /* keep track of map index for tracepoint */
>         struct xdp_bulk_queue __percpu *bulkq;
> @@ -70,11 +77,29 @@ struct bpf_dtab {
>         struct bpf_dtab_netdev **netdev_map;
>         struct list_head __percpu *flush_list;
>         struct list_head list;
> +
> +       /* these are only used for DEVMAP_HASH type maps */
> +       unsigned int items;
> +       struct hlist_head *dev_index_head;
> +       spinlock_t index_lock;
> };
>
> static DEFINE_SPINLOCK(dev_map_lock);
> static LIST_HEAD(dev_map_list);
>
> +static struct hlist_head *dev_map_create_hash(void)
> +{
> +       int i;
> +       struct hlist_head *hash;
> +
> +       hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
> +       if (hash != NULL)
> +               for (i = 0; i < NETDEV_HASHENTRIES; i++)
> +                       INIT_HLIST_HEAD(&hash[i]);
> +
> +       return hash;
> +}
> +
> static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>                             bool check_memlock)
> {
> @@ -98,6 +123,9 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>         cost += sizeof(struct list_head) * num_possible_cpus();
>
> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
> +               cost += sizeof(struct hlist_head) * NETDEV_HASHENTRIES;
> +
>         /* if map size is larger than memlock limit, reject it */
>         err = bpf_map_charge_init(&dtab->map.memory, cost);
>         if (err)
> @@ -116,8 +144,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>         if (!dtab->netdev_map)
>                 goto free_percpu;
>
> +       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
> +               dtab->dev_index_head = dev_map_create_hash();
> +               if (!dtab->dev_index_head)
> +                       goto free_map_area;
> +
> +               spin_lock_init(&dtab->index_lock);
> +       }
>
>         return 0;
>
> +free_map_area:
> +       bpf_map_area_free(dtab->netdev_map);
> free_percpu:
>         free_percpu(dtab->flush_list);
> free_charge:
> @@ -199,6 +237,7 @@ static void dev_map_free(struct bpf_map *map)
>
>         free_percpu(dtab->flush_list);
>         bpf_map_area_free(dtab->netdev_map);
> +       kfree(dtab->dev_index_head);
>         kfree(dtab);
> }
>
> @@ -219,6 +258,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>         return 0;
> }
>
> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
> +                                                    int idx)
> +{
> +       return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
> +}
> +
> +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       struct hlist_head *head = dev_map_index_hash(dtab, key);
> +       struct bpf_dtab_netdev *dev;
> +
> +       hlist_for_each_entry_rcu(dev, head, index_hlist)
> +               if (dev->idx == key)
> +                       return dev;
> +
> +       return NULL;
> +}
> +
> +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
> +                                    void *next_key)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       u32 idx, *next = next_key;
> +       struct bpf_dtab_netdev *dev, *next_dev;
> +       struct hlist_head *head;
> +       int i = 0;
> +
> +       if (!key)
> +               goto find_first;
> +
> +       idx = *(u32 *)key;
> +
> +       dev = __dev_map_hash_lookup_elem(map, idx);
> +       if (!dev)
> +               goto find_first;
> +
> +       next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
> +                                   struct bpf_dtab_netdev, index_hlist);
Just want to get a better understanding: why do you want
hlist_entry_safe() here instead of hlist_entry()?
Also, maybe rcu_dereference() instead of rcu_dereference_raw()?
dev_map_hash_get_next_key() is called from syscall.c within an
rcu_read_lock() region.
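I.e., something like this (just a sketch of the variant I have in mind,
untested):

    next_dev = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&dev->index_hlist)),
                                struct bpf_dtab_netdev, index_hlist);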
> +
> +       if (next_dev) {
> +               *next = next_dev->idx;
> +               return 0;
> +       }
> +
> +       i = idx & (NETDEV_HASHENTRIES - 1);
> +       i++;
> +
> + find_first:
> +       for (; i < NETDEV_HASHENTRIES; i++) {
> +               head = dev_map_index_hash(dtab, i);
> +
> +               next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
> +                                           struct bpf_dtab_netdev,
> +                                           index_hlist);
Ditto; the same questions as above.
> +               if (next_dev) {
> +                       *next = next_dev->idx;
> +                       return 0;
> +               }
> +       }
> +
> +       return -ENOENT;
> +}
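
For context, this is the kernel side of userspace map iteration, which
would be driven roughly like the following sketch (assuming libbpf's
bpf_map_get_next_key(); fd is the map fd):

    __u32 key, next_key;
    int err;

    /* Passing NULL as the current key returns the first key. */
    err = bpf_map_get_next_key(fd, NULL, &next_key);
    while (!err) {
            key = next_key;
            /* ... look up and use the entry for 'key' ... */
            err = bpf_map_get_next_key(fd, &key, &next_key);
    }
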
> +
> static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
>                        bool in_napi_ctx)
> {
> @@ -374,6 +477,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
>         return dev ? &dev->ifindex : NULL;
> }
>
> +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
> +{
> +       struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
> +                                                                 *(u32 *)key);
> +       struct net_device *dev = obj ? obj->dev : NULL;
When could obj->dev be NULL?
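If it never can be NULL at this point (the netdev notifier removes map
entries before a device goes away), this could be shortened to
something like:

    return obj ? &obj->dev->ifindex : NULL;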
> +
> +       return dev ? &dev->ifindex : NULL;
> +}
> +
> static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
> {
> if (dev->dev->netdev_ops->ndo_xdp_xmit) {
> @@ -423,6 +535,27 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
>         return 0;
> }
>
> +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       struct bpf_dtab_netdev *old_dev;
> +       int k = *(u32 *)key;
> +
> +       old_dev = __dev_map_hash_lookup_elem(map, k);
> +       if (!old_dev)
> +               return 0;
> +
> +       spin_lock(&dtab->index_lock);
> +       if (!hlist_unhashed(&old_dev->index_hlist)) {
> +               dtab->items--;
> +               hlist_del_init_rcu(&old_dev->index_hlist);
> +       }
> +       spin_unlock(&dtab->index_lock);
> +
> +       call_rcu(&old_dev->rcu, __dev_map_entry_free);
> +       return 0;
> +}
> +
> static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
>                                                     struct bpf_dtab *dtab,
>                                                     u32 ifindex,
> @@ -503,6 +636,55 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>                                      map, key, value, map_flags);
> }
>
> +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
> +                                     void *key, void *value, u64 map_flags)
> +{
> +       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +       struct bpf_dtab_netdev *dev, *old_dev;
> +       u32 ifindex = *(u32 *)value;
> +       u32 idx = *(u32 *)key;
> +
> +       if (unlikely(map_flags > BPF_EXIST || !ifindex))
> +               return -EINVAL;
> +
> +       old_dev = __dev_map_hash_lookup_elem(map, idx);
> +       if (old_dev && (map_flags & BPF_NOEXIST))
> +               return -EEXIST;
> +
> +       dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
> +       if (IS_ERR(dev))
> +               return PTR_ERR(dev);
> +
> +       spin_lock(&dtab->index_lock);
> +
> +       if (old_dev) {
> +               hlist_del_rcu(&old_dev->index_hlist);
> +       } else {
> +               if (dtab->items >= dtab->map.max_entries) {
> +                       spin_unlock(&dtab->index_lock);
> +                       call_rcu(&dev->rcu, __dev_map_entry_free);
> +                       return -ENOSPC;
> +               }
> +               dtab->items++;
> +       }
> +
> +       hlist_add_head_rcu(&dev->index_hlist,
> +                          dev_map_index_hash(dtab, idx));
> +       spin_unlock(&dtab->index_lock);
> +
> +       if (old_dev)
> +               call_rcu(&old_dev->rcu, __dev_map_entry_free);
> +
> +       return 0;
> +}
> +
> +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
> +                                    u64 map_flags)
> +{
> +       return __dev_map_hash_update_elem(current->nsproxy->net_ns,
> +                                         map, key, value, map_flags);
> +}
> +
> const struct bpf_map_ops dev_map_ops = {
>         .map_alloc = dev_map_alloc,
>         .map_free = dev_map_free,
> @@ -513,6 +695,16 @@ const struct bpf_map_ops dev_map_ops = {
>         .map_check_btf = map_check_no_btf,
> };
>
> +const struct bpf_map_ops dev_map_hash_ops = {
> +       .map_alloc = dev_map_alloc,
> +       .map_free = dev_map_free,
> +       .map_get_next_key = dev_map_hash_get_next_key,
> +       .map_lookup_elem = dev_map_hash_lookup_elem,
> +       .map_update_elem = dev_map_hash_update_elem,
> +       .map_delete_elem = dev_map_hash_delete_elem,
> +       .map_check_btf = map_check_no_btf,
> +};
> +
> static int dev_map_notification(struct notifier_block *notifier,
>                                 ulong event, void *ptr)
> {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index a2e763703c30..231b9e22827c 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3460,6 +3460,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>                         goto error;
>                 break;
>         case BPF_MAP_TYPE_DEVMAP:
> +       case BPF_MAP_TYPE_DEVMAP_HASH:
>                 if (func_id != BPF_FUNC_redirect_map &&
>                     func_id != BPF_FUNC_map_lookup_elem)
>                         goto error;
> @@ -3542,6 +3543,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>                 break;
>         case BPF_FUNC_redirect_map:
>                 if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
> +                   map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
>                     map->map_type != BPF_MAP_TYPE_CPUMAP &&
>                     map->map_type != BPF_MAP_TYPE_XSKMAP)
>                         goto error;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 089aaea0ccc6..276961c4e0d4 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>         int err;
>
>         switch (map->map_type) {
> -       case BPF_MAP_TYPE_DEVMAP: {
> +       case BPF_MAP_TYPE_DEVMAP:
> +       case BPF_MAP_TYPE_DEVMAP_HASH: {
>                 struct bpf_dtab_netdev *dst = fwd;
>
>                 err = dev_map_enqueue(dst, xdp, dev_rx);
> @@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void)
>         if (map) {
>                 switch (map->map_type) {
>                 case BPF_MAP_TYPE_DEVMAP:
> +               case BPF_MAP_TYPE_DEVMAP_HASH:
>                         __dev_map_flush(map);
>                         break;
>                 case BPF_MAP_TYPE_CPUMAP:
> @@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
>         switch (map->map_type) {
>         case BPF_MAP_TYPE_DEVMAP:
>                 return __dev_map_lookup_elem(map, index);
> +       case BPF_MAP_TYPE_DEVMAP_HASH:
> +               return __dev_map_hash_lookup_elem(map, index);
>         case BPF_MAP_TYPE_CPUMAP:
>                 return __cpu_map_lookup_elem(map, index);
>         case BPF_MAP_TYPE_XSKMAP:
> @@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
>         ri->tgt_value = NULL;
>         WRITE_ONCE(ri->map, NULL);
>
> -       if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
> +       if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
> +           map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
>                 struct bpf_dtab_netdev *dst = fwd;
>
>                 err = dev_map_generic_redirect(dst, skb, xdp_prog);
>