[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20231009182753.851551-5-toke@redhat.com>
Date: Mon, 9 Oct 2023 20:27:52 +0200
From: Toke Høiland-Jørgensen <toke@...hat.com>
To: David Ahern <dsahern@...il.com>,
Stephen Hemminger <stephen@...workplumber.org>
Cc: netdev@...r.kernel.org,
Toke Høiland-Jørgensen <toke@...hat.com>,
Nicolas Dichtel <nicolas.dichtel@...nd.com>,
Christian Brauner <brauner@...nel.org>,
"Eric W . Biederman" <ebiederm@...ssion.com>,
David Laight <David.Laight@...LAB.COM>
Subject: [RFC PATCH iproute2-next 4/5] ip: Also create and persist mount namespace when creating netns
When creating a new network namespace, persist not only the network namespace
reference itself, but also create and persist a new mount namespace that is
paired with the network namespace. This means that multiple subsequent
invocations of 'ip netns exec' will reuse the same mount namespace instead of
creating a new namespace on every entry, as was the behaviour before this patch.
The persistent mount namespace has the benefit that any new mounts created
inside the namespace will persist. Most notably, this is useful when using bpffs
instances along with 'ip netns', as these were previously transient to a single
'ip netns' invocation.
To preserve backwards compatibility, when changing namespaces we will fall back
to the old behaviour of creating a new mount namespace when switching netns, if
we can't find a persisted namespace to enter. This can happen if the netns
instance was created with a previous version of iproute2 that doesn't persist
the mount namespace.
One caveat of the mount namespace persistence is that we can't make the
containing directory mount shared, the way we do with the netns mounts. This
means that if 'ip netns del' is invoked *inside* a namespace created with 'ip
netns', the mount namespace reference will not be deleted and will stick around
in the original mount namespace where it was created. This is unavoidable
because it is not possible to create a bind-mounted reference to a mount
namespace inside that same mount namespace (as that would create a circular
reference).
In such a situation, we may end up with the network namespace reference being
removed but the mount namespace reference sticking around (the same thing can
happen if 'ip netns del' is executed with an older version of iproute2). In this
situation, a subsequent 'ip netns add' with the same namespace name will end up
reusing the old mount namespace reference.
Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
---
Makefile | 2 ++
ip/ipnetns.c | 64 +++++++++++++++++++++++++++++++++++++++++++------
lib/namespace.c | 8 ++++++-
3 files changed, 66 insertions(+), 8 deletions(-)
diff --git a/Makefile b/Makefile
index 5c559c8dc805..aeb1ddc53c6a 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ SBINDIR?=/sbin
CONF_ETC_DIR?=/etc/iproute2
CONF_USR_DIR?=$(LIBDIR)/iproute2
NETNS_RUN_DIR?=/var/run/netns
+MNTNS_RUN_DIR?=/var/run/netns-mnt
NETNS_ETC_DIR?=/etc/netns
DATADIR?=$(PREFIX)/share
HDRDIR?=$(PREFIX)/include/iproute2
@@ -41,6 +42,7 @@ endif
DEFINES+=-DCONF_USR_DIR=\"$(CONF_USR_DIR)\" \
-DCONF_ETC_DIR=\"$(CONF_ETC_DIR)\" \
-DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \
+ -DMNTNS_RUN_DIR=\"$(MNTNS_RUN_DIR)\" \
-DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\" \
-DCONF_COLOR=$(CONF_COLOR)
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index 529790482683..551819577755 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -733,13 +733,24 @@ static int netns_identify(int argc, char **argv)
static int on_netns_del(char *nsname, void *arg)
{
- char netns_path[PATH_MAX];
+ char ns_path[PATH_MAX];
+ struct stat st;
+
+ snprintf(ns_path, sizeof(ns_path), "%s/%s", MNTNS_RUN_DIR, nsname);
+ if (!stat(ns_path, &st)) { /* may not exist if created by old iproute2 */
+ umount2(ns_path, MNT_DETACH);
+ if (unlink(ns_path) < 0) {
+ fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
+ ns_path, strerror(errno));
+ return -1;
+ }
+ }
- snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, nsname);
- umount2(netns_path, MNT_DETACH);
- if (unlink(netns_path) < 0) {
+ snprintf(ns_path, sizeof(ns_path), "%s/%s", NETNS_RUN_DIR, nsname);
+ umount2(ns_path, MNT_DETACH);
+ if (unlink(ns_path) < 0) {
fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
- netns_path, strerror(errno));
+ ns_path, strerror(errno));
return -1;
}
return 0;
@@ -885,17 +896,46 @@ static int bind_ns_file(const char *parent, const char *nsfile,
return 0;
}
+static ino_t get_mnt_ino(pid_t pid)
+{
+ char path[PATH_MAX];
+ struct stat st;
+
+ snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
+
+ if (stat(path, &st) != 0) {
+ fprintf(stderr, "stat of %s failed: %s\n",
+ path, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ return st.st_ino;
+}
+
static pid_t bind_ns_files_from_child(const char *ns_name, pid_t target_pid,
int *fd)
{
+ ino_t mnt_ino;
pid_t child;
+ mnt_ino = get_mnt_ino(getpid());
+
child = fork_and_wait(fd);
if (child)
return child;
if (bind_ns_file(NETNS_RUN_DIR, "net", ns_name, target_pid))
exit(EXIT_FAILURE);
+
+ /* We can only bind the mount namespace reference if the target pid is
+ * actually in a different mount namespace than ourselves. We ignore any
+ * errors in creating the mount namespace reference because an old
+ * namespace mount may be present if a network namespace with the same
+ * name was previously removed by an older version of iproute2; in this
+ * case that old reference will just be reused.
+ */
+ if (mnt_ino != get_mnt_ino(target_pid))
+ bind_ns_file(MNTNS_RUN_DIR, "mnt", ns_name, target_pid);
+
exit(EXIT_SUCCESS);
}
@@ -1003,8 +1043,13 @@ static int netns_add(int argc, char **argv, bool create)
* unmounting a network namespace file in one namespace will unmount the
* network namespace file in all namespaces allowing the network
* namespace to be freed sooner.
+ *
+ * The mount namespace directory cannot be shared because it's not
+ * possible to mount references to a mount namespace inside that
+ * namespace itself.
*/
- if (prepare_ns_mount_dir(NETNS_RUN_DIR, MS_SHARED))
+ if (prepare_ns_mount_dir(NETNS_RUN_DIR, MS_SHARED) ||
+ prepare_ns_mount_dir(MNTNS_RUN_DIR, MS_SLAVE))
return -1;
child = bind_ns_files_from_child(name, pid, &event_fd);
@@ -1012,12 +1057,17 @@ static int netns_add(int argc, char **argv, bool create)
exit(EXIT_FAILURE);
if (create) {
- if (unshare(CLONE_NEWNET) < 0) {
+ if (unshare(CLONE_NEWNET | CLONE_NEWNS) < 0) {
fprintf(stderr, "Failed to create a new network namespace \"%s\": %s\n",
name, strerror(errno));
close(event_fd);
exit(EXIT_FAILURE);
}
+
+ if (prepare_mountns(name, false)) {
+ close(event_fd);
+ exit(EXIT_FAILURE);
+ }
}
return sync_with_child(child, event_fd);
diff --git a/lib/namespace.c b/lib/namespace.c
index 5e310762f34b..5f2449fb0003 100644
--- a/lib/namespace.c
+++ b/lib/namespace.c
@@ -127,7 +127,13 @@ int netns_switch(char *name)
if (switch_ns(NETNS_RUN_DIR, name, CLONE_NEWNET))
return -1;
- return prepare_mountns(name, true);
+ /* Try to enter an existing persisted mount namespace. If this fails,
+ * preserve the old behaviour of creating a new namespace on entry.
+ */
+ if (switch_ns(MNTNS_RUN_DIR, name, CLONE_NEWNS))
+ return prepare_mountns(name, true);
+
+ return 0;
}
int netns_get_fd(const char *name)
--
2.42.0
Powered by blists - more mailing lists