[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <174265454834.356712.6297354306843654837.stgit@pro.pro>
Date: Sat, 22 Mar 2025 17:42:28 +0300
From: Kirill Tkhai <tkhai@...ru>
To: netdev@...r.kernel.org,
linux-kernel@...r.kernel.org
Cc: tkhai@...ru
Subject: [PATCH NET-PREV 37/51] net: Introduce delayed event work
Some drivers (e.g., failover and netvsc) use netdevice notifiers
to link devices to each other by calling netdev_master_upper_dev_link().
We want 1) both devices to use the same lock after linking, and
2) netdevice notifiers to be called with nd_lock held. These two
requirements can't be satisfied at the same time, because taking the
second lock from inside the notifier creates a lock-ordering problem
(lock inversion):
  lock_netdev(dev1, &nd_lock1);
  call_netdevice_notifier()
    lock_netdev(dev2, &nd_lock2); <--- problem here if !locks_ordered()
    nd_lock_transfer_devices(nd_lock, nd_lock2);
    netdev_master_upper_dev_link(dev1, dev2);
We can't use double_lock_netdev() instead of lock_netdev() here,
since dev2 is unknown at that moment.
This patch introduces an interface that allows events to be handled in
delayed work. It consists of three parts:
1) Delayed work that calls the event callback. The work starts with
no locks held, so it can take the locks of both devices in the correct
order;
2) A completion to notify the task that the delayed work is done;
3) A task_work that lets the task wait for the completion at
a point where the task no longer holds nd_lock.
Here is an example of what happens on module loading:
  [Task]                                 [Work]
  insmod slave_netdev_drv.ko
    enter to kernel
      init_module()
        ...
        ...
        lock_netdev()
        call_netdevice_notifier()
          schedule_delayed_event()
        unlock_netdev()
                                         delayed_event_work()
                                           double_lock_netdev(dev1, &nd_lock1, dev2, &nd_lock2)
                                           nd_lock_transfer_devices(nd_lock, nd_lock2)
                                           netdev_master_upper_dev_link(dev1, dev2)
                                           double_unlock_netdev(nd_lock1, nd_lock2)
                                           complete()
      wait_for_delayed_event_work()
        wait_for_completion()
    exit to userspace
As can be seen, using task work preserves the user-visible behavior here:
we return from the syscall to userspace only after the delayed work has
completed and all events have been handled. This is why the task work is
needed.
Signed-off-by: Kirill Tkhai <tkhai@...ru>
---
include/linux/netdevice.h | 2 +
net/core/dev.c | 95 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 97 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e9052e808a4..83b675ec2b0a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2991,6 +2991,8 @@ netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info);
+int schedule_delayed_event(struct net_device *dev,
+ void (*func)(struct net_device *dev));
#define for_each_netdev(net, d) \
list_for_each_entry(d, &(net)->dev_base_head, dev_list)
diff --git a/net/core/dev.c b/net/core/dev.c
index e6809a80644e..1c447446215d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -154,6 +154,7 @@
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
+#include <linux/task_work.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
@@ -2088,6 +2089,100 @@ static int call_netdevice_notifiers_mtu(unsigned long val,
return call_netdevice_notifiers_info(val, &info.info);
}
+struct event_info {	/* state of one deferred netdevice event; freed when @usage drops to zero */
+ struct work_struct work;	/* work item; initialised to run delayed_event_work() */
+ struct net_device *dev;	/* device passed to @func; set to NULL once @func has run */
+ netdevice_tracker dev_tracker;	/* tracker for the netdev_hold()/netdev_put() pair on @dev */
+ void (*func)(struct net_device *slave_dev);	/* event callback, invoked from the work item */
+
+ struct callback_head task_work;	/* task_work; initialised to run wait_for_delayed_event_work() */
+ struct completion comp;	/* completed by the work item after @func returned and @dev was put */
+ refcount_t usage;	/* starts at 1; incremented when the task_work is queued successfully */
+};
+
+/*
+ * Drop one reference to @info. Whoever drops the last reference
+ * frees the structure.
+ */
+static void put_delayed_reg_info(struct event_info *info)
+{
+	if (!refcount_dec_and_test(&info->usage))
+		return;
+
+	kfree(info);
+}
+
+/*
+ * Work item handler: invoke the deferred callback on the device, release
+ * the device, signal the completion and drop the work item's reference.
+ */
+static void delayed_event_work(struct work_struct *work)
+{
+	struct event_info *ev = container_of(work, struct event_info, work);
+
+	ev->func(ev->dev);
+
+	/*
+	 * The device only has to be held for the duration of the callback,
+	 * not for the whole lifetime of @ev. Release it right away, since
+	 * the task that submitted this work may be waiting for the @dev
+	 * counter.
+	 */
+	netdev_put(ev->dev, &ev->dev_tracker);
+	ev->dev = NULL;
+
+	complete(&ev->comp);
+	put_delayed_reg_info(ev);
+}
+
+/*
+ * task_work callback: sleep until the work item has signalled the
+ * completion, then drop one reference to the event info.
+ */
+static void wait_for_delayed_event_work(struct callback_head *task_work)
+{
+	struct event_info *ev = container_of(task_work, struct event_info,
+					     task_work);
+
+	wait_for_completion(&ev->comp);
+	put_delayed_reg_info(ev);
+}
+
+/*
+ * Allocate and initialise the bookkeeping for one deferred event.
+ *
+ * The returned structure carries a single reference and holds @dev via
+ * netdev_hold(). Returns NULL if the allocation fails.
+ */
+static struct event_info *alloc_delayed_event_info(struct net_device *dev,
+				void (*func)(struct net_device *dev))
+{
+	struct event_info *ev = kmalloc(sizeof(*ev), GFP_KERNEL);
+
+	if (ev) {
+		/* What to run and on which device. */
+		ev->func = func;
+		ev->dev = dev;
+
+		/* Execution and synchronisation primitives. */
+		INIT_WORK(&ev->work, delayed_event_work);
+		init_task_work(&ev->task_work, wait_for_delayed_event_work);
+		init_completion(&ev->comp);
+		refcount_set(&ev->usage, 1);
+
+		netdev_hold(dev, &ev->dev_tracker, GFP_KERNEL);
+	}
+
+	return ev;
+}
+
+/**
+ * schedule_delayed_event - handle a netdevice event from a work item
+ * @dev: device handed to @func
+ * @func: callback to invoke from the work item
+ *
+ * Return: NOTIFY_OK once the work has been scheduled, NOTIFY_DONE if the
+ * event info could not be allocated.
+ */
+int schedule_delayed_event(struct net_device *dev,
+			   void (*func)(struct net_device *dev))
+{
+	struct event_info *ev = alloc_delayed_event_info(dev, func);
+	bool user_task = !(current->flags & PF_KTHREAD);
+
+	if (!ev)
+		return NOTIFY_DONE;
+
+	/*
+	 * A regular (non-kthread) caller must not get back to userspace
+	 * before the registration has completed, so queue a task_work that
+	 * waits for it. This way, e.g., a syscall caller finds failover
+	 * already connected after loading the slave device driver. The
+	 * extra reference is taken only if the task_work was really queued.
+	 */
+	if (user_task && !task_work_add(current, &ev->task_work, TWA_RESUME))
+		refcount_inc(&ev->usage);
+
+	schedule_work(&ev->work);
+	return NOTIFY_OK;
+}
+EXPORT_SYMBOL_GPL(schedule_delayed_event);
+
#ifdef CONFIG_NET_INGRESS
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
Powered by blists - more mailing lists