[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180105060931.30815-9-jakub.kicinski@netronome.com>
Date: Thu, 4 Jan 2018 22:09:23 -0800
From: Jakub Kicinski <jakub.kicinski@...ronome.com>
To: netdev@...r.kernel.org, alexei.starovoitov@...il.com,
daniel@...earbox.net, davem@...emloft.net
Cc: oss-drivers@...ronome.com, tehnerd@...com,
Jakub Kicinski <jakub.kicinski@...ronome.com>
Subject: [PATCH bpf-next 08/16] bpf: offload: add map offload infrastructure
BPF map offload follow similar path to program offload. At creation
time users may specify ifindex of the device on which they want to
create the map. Map will be validated by the kernel's
.map_alloc_check callback and device driver will be called for the
actual allocation. Map will have an empty set of operations
associated with it (save for alloc and free callbacks). The real
device callbacks are kept in map->offload->dev_ops because they
have slightly different signatures. Map operations are called in
process context so the driver may communicate with HW freely,
msleep(), wait() etc.
Map alloc and free callbacks are muxed via existing .ndo_bpf, and
are always called with rtnl lock held. Maps and programs are
guaranteed to be destroyed before .ndo_uninit (i.e. before
unregister_netdev() returns). Map callbacks are invoked with
bpf_devs_lock *read* locked, drivers must take care of exclusive
locking if necessary.
All offload-specific branches are marked with unlikely() (through
bpf_map_is_dev_bound()), given that branch penalty will be
negligible compared to IO anyway, and we don't want to penalize
SW path unnecessarily.
Signed-off-by: Jakub Kicinski <jakub.kicinski@...ronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@...ronome.com>
---
include/linux/bpf.h | 59 +++++++++++++
include/linux/netdevice.h | 6 ++
include/uapi/linux/bpf.h | 1 +
kernel/bpf/offload.c | 189 +++++++++++++++++++++++++++++++++++++++--
kernel/bpf/syscall.c | 42 +++++++--
kernel/bpf/verifier.c | 7 ++
tools/include/uapi/linux/bpf.h | 1 +
7 files changed, 293 insertions(+), 12 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 425ca3cd1e32..d9a0beb78825 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -65,6 +65,33 @@ struct bpf_map {
#endif
};
+struct bpf_offloaded_map;
+
+struct bpf_map_dev_ops {
+ int (*map_get_next_key)(struct bpf_offloaded_map *map,
+ void *key, void *next_key);
+ int (*map_lookup_elem)(struct bpf_offloaded_map *map,
+ void *key, void *value);
+ int (*map_update_elem)(struct bpf_offloaded_map *map,
+ void *key, void *value, u64 flags);
+ int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key);
+};
+
+struct bpf_offloaded_map {
+ struct bpf_map map;
+ struct net_device *netdev;
+ const struct bpf_map_dev_ops *dev_ops;
+ void *dev_priv;
+ struct list_head offloads;
+};
+
+static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
+{
+ return container_of(map, struct bpf_offloaded_map, map);
+}
+
+extern const struct bpf_map_ops bpf_map_offload_ops;
+
/* function argument constraints */
enum bpf_arg_type {
ARG_DONTCARE = 0, /* unused argument in helper function */
@@ -359,6 +386,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages);
void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
@@ -536,6 +564,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
struct bpf_prog *prog);
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
+int bpf_map_offload_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags);
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
+int bpf_map_offload_get_next_key(struct bpf_map *map,
+ void *key, void *next_key);
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map);
+
#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
@@ -543,6 +580,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
{
return aux->offload_requested;
}
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+ return unlikely(map->ops == &bpf_map_offload_ops);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
+void bpf_map_offload_map_free(struct bpf_map *map);
#else
static inline int bpf_prog_offload_init(struct bpf_prog *prog,
union bpf_attr *attr)
@@ -554,6 +599,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
{
return false;
}
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+ return false;
+}
+
+static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_map_offload_map_free(struct bpf_map *map)
+{
+}
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 49bfc6eec74c..ecbdb5d519d6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -802,6 +802,8 @@ enum bpf_netdev_command {
BPF_OFFLOAD_VERIFIER_PREP,
BPF_OFFLOAD_TRANSLATE,
BPF_OFFLOAD_DESTROY,
+ BPF_OFFLOAD_MAP_ALLOC,
+ BPF_OFFLOAD_MAP_FREE,
};
struct bpf_prog_offload_ops;
@@ -832,6 +834,10 @@ struct netdev_bpf {
struct {
struct bpf_prog *prog;
} offload;
+ /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
+ struct {
+ struct bpf_offloaded_map *offmap;
+ };
};
};
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f2f8b36e2ad4..9910ab632652 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
+ __u32 map_ifindex; /* ifindex of netdev to create on */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index cdd1e19a668b..dddc172c8818 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,11 +24,13 @@
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
-/* Protects bpf_prog_offload_devs and offload members of all progs.
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
* RTNL lock cannot be taken when holding this lock.
*/
static DECLARE_RWSEM(bpf_devs_lock);
static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -250,11 +252,187 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_offloaded_map *offmap;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+ attr->map_type != BPF_MAP_TYPE_HASH)
+ return ERR_PTR(-EINVAL);
+
+ offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ if (!offmap)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&offmap->map, attr);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+ err = bpf_dev_offload_check(offmap->netdev);
+ if (err)
+ goto err_unlock;
+
+ err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+ if (err)
+ goto err_unlock;
+
+ list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return &offmap->map;
+
+err_unlock:
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+ kfree(offmap);
+ return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map, true);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ if (offmap->netdev)
+ __bpf_map_offload_destroy(offmap);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+ flags);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_delete_elem(offmap, key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap;
+ struct bpf_prog_offload *offload;
+ bool ret;
+
+ if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map))
+ return false;
+ if (!bpf_prog_is_dev_bound(prog->aux))
+ return true;
+
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ offmap = map_to_offmap(map);
+
+ ret = offload && offload->netdev == offmap->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+ struct bpf_prog_offload *offload, *tmp;
+
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+ struct bpf_offloaded_map *offmap, *tmp;
+
+ list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+ if (offmap->netdev == netdev)
+ __bpf_map_offload_destroy(offmap);
+}
+
static int bpf_offload_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
- struct bpf_prog_offload *offload, *tmp;
ASSERT_RTNL();
@@ -265,11 +443,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
break;
down_write(&bpf_devs_lock);
- list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
- offloads) {
- if (offload->netdev == netdev)
- __bpf_prog_offload_destroy(offload->prog);
- }
+ bpf_offload_orphan_all_progs(netdev);
+ bpf_offload_orphan_all_maps(netdev);
up_write(&bpf_devs_lock);
break;
default:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 14adf29fc0f2..d5b773f04da1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr,
return 0;
}
+const struct bpf_map_ops bpf_map_offload_ops = {
+ .map_alloc = bpf_map_offload_map_alloc,
+ .map_free = bpf_map_offload_map_free,
+};
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
@@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
if (err)
return ERR_PTR(err);
}
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
@@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
unsigned long flags;
+ /* Offloaded maps are removed from the IDR store when their device
+ * disappears - even if someone holds an fd to them they are unusable,
+ * the memory is gone, all ops will fail; they are simply waiting for
+ * refcnt to drop to be freed.
+ */
+ if (!map->id)
+ return;
+
if (do_idr_lock)
spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
+ map->id = 0;
if (do_idr_lock)
spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -676,6 +694,9 @@ static int map_update_elem(union bpf_attr *attr)
if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
+ } else if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+ goto out;
}
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
@@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ goto out;
+ }
+
preempt_disable();
__this_cpu_inc(bpf_prog_active);
rcu_read_lock();
@@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_delete_elem(map, ufd, key);
kfree(key);
@@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_get_next_key(map, key, next_key);
+ goto out;
+ }
+
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
rcu_read_unlock();
+out:
if (err)
goto free_next_key;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2b211262c25..187f245a3338 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4795,6 +4795,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
}
+
+ if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ !bpf_offload_dev_match(prog, map)) {
+ verbose(env, "offload device mismatch between prog and map\n");
+ return -EINVAL;
+ }
+
return 0;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e8c60acfa32..69f96af4a569 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
+ __u32 map_ifindex; /* ifindex of netdev to create on */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
--
2.15.1
Powered by blists - more mailing lists