summaryrefslogtreecommitdiff
path: root/fs/namespace.c
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2024-06-25 11:19:24 +0200
committerChristian Brauner <brauner@kernel.org>2024-06-28 14:36:43 +0200
commita7ebb0fe43edfc869db3725a5d984de3e47c646c (patch)
tree4a0d756855ed36e8a0bf2bc0a4a3595ca063567d /fs/namespace.c
parentd04bccd8c19d601232ed3e3c9e248c0040167d47 (diff)
parentd896f71ce1f2e73813dc6f639eb0cf6f4beefdaa (diff)
Merge patch series "Support foreign mount namespace with statmount/listmount"
Josef Bacik <josef@toxicpanda.com> says: Currently the only way to iterate over mount entries in mount namespaces that aren't your own is to trawl through /proc in order to find /proc/$PID/mountinfo for the mount namespace that you want. This is hugely inefficient, so extend both statmount() and listmount() to allow specifying a mount namespace id in order to get to mounts in other mount namespaces. There are a few components to this 1. Having a global index of the mount namespace based on the ->seq value in the mount namespace. This gives us a unique identifier that isn't re-used. 2. Support looking up mount namespaces based on that unique identifier, and validating the user has permission to access the given mount namespace. 3. Provide a new ioctl() on nsfs in order to extract the unique identifier we can use for statmount() and listmount(). The code is relatively straightforward, and there is a selftest provided to validate everything works properly. This is based on vfs.all as of last week, so must be applied onto a tree that has Christians error handling rework in this area. If you wish you can pull the tree directly here https://github.com/josefbacik/linux/tree/listmount.combined Christian and I collaborated on this series, which is why there's patches from both of us in this series. Christian Brauner (4): fs: relax permissions for listmount() fs: relax permissions for statmount() fs: Allow listmount() in foreign mount namespace fs: Allow statmount() in foreign mount namespace Josef Bacik (4): fs: keep an index of current mount namespaces fs: export the mount ns id via statmount fs: add an ioctl to get the mnt ns id from nsfs selftests: add a test for the foreign mnt ns extensions fs/mount.h | 2 + fs/namespace.c | 240 ++++++++++-- fs/nsfs.c | 14 + include/uapi/linux/mount.h | 6 +- include/uapi/linux/nsfs.h | 2 + .../selftests/filesystems/statmount/Makefile | 2 +- .../filesystems/statmount/statmount.h | 46 +++ .../filesystems/statmount/statmount_test.c | 53 +-- .../filesystems/statmount/statmount_test_ns.c | 360 ++++++++++++++++++ 9 files changed, 659 insertions(+), 66 deletions(-) create mode 100644 tools/testing/selftests/filesystems/statmount/statmount.h create mode 100644 tools/testing/selftests/filesystems/statmount/statmount_test_ns.c Link: https://lore.kernel.org/r/cover.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c240
1 files changed, 216 insertions, 24 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 02a697287da5..e871f73c4c8c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
struct mount_kattr {
unsigned int attr_set;
@@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
+static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
+{
+ u64 seq_b = ns->seq;
+
+ if (seq < seq_b)
+ return -1;
+ if (seq > seq_b)
+ return 1;
+ return 0;
+}
+
+static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
+}
+
+static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct mnt_namespace *ns_a = node_to_mnt_ns(a);
+ struct mnt_namespace *ns_b = node_to_mnt_ns(b);
+ u64 seq_a = ns_a->seq;
+
+ return mnt_ns_cmp(seq_a, ns_b) < 0;
+}
+
+static void mnt_ns_tree_add(struct mnt_namespace *ns)
+{
+ guard(write_lock)(&mnt_ns_tree_lock);
+ rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+}
+
+static void mnt_ns_release(struct mnt_namespace *ns)
+{
+ lockdep_assert_not_held(&mnt_ns_tree_lock);
+
+ /* keep alive for {list,stat}mount() */
+ if (refcount_dec_and_test(&ns->passive)) {
+ put_user_ns(ns->user_ns);
+ kfree(ns);
+ }
+}
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+
+static void mnt_ns_tree_remove(struct mnt_namespace *ns)
+{
+ /* remove from global mount namespace list */
+ if (!is_anon_ns(ns)) {
+ guard(write_lock)(&mnt_ns_tree_lock);
+ rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+ }
+
+ mnt_ns_release(ns);
+}
+
+/*
+ * Returns the mount namespace which either has the specified id, or has the
+ * next smallest id afer the specified one.
+ */
+static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+{
+ struct rb_node *node = mnt_ns_tree.rb_node;
+ struct mnt_namespace *ret = NULL;
+
+ lockdep_assert_held(&mnt_ns_tree_lock);
+
+ while (node) {
+ struct mnt_namespace *n = node_to_mnt_ns(node);
+
+ if (mnt_ns_id <= n->seq) {
+ ret = node_to_mnt_ns(node);
+ if (mnt_ns_id == n->seq)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Lookup a mount namespace by id and take a passive reference count. Taking a
+ * passive reference means the mount namespace can be emptied if e.g., the last
+ * task holding an active reference exits. To access the mounts of the
+ * namespace the @namespace_sem must first be acquired. If the namespace has
+ * already shut down before acquiring @namespace_sem, {list,stat}mount() will
+ * see that the mount rbtree of the namespace is empty.
+ */
+static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
+{
+ struct mnt_namespace *ns;
+
+ guard(read_lock)(&mnt_ns_tree_lock);
+ ns = mnt_ns_find_id_at(mnt_ns_id);
+ if (!ns || ns->seq != mnt_ns_id)
+ return NULL;
+
+ refcount_inc(&ns->passive);
+ return ns;
+}
+
static inline void lock_mount_hash(void)
{
write_seqlock(&mount_lock);
@@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
if (!is_anon_ns(ns))
ns_free_inum(&ns->ns);
dec_mnt_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
- kfree(ns);
+ mnt_ns_tree_remove(ns);
}
/*
@@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
+ refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
+ RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
@@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
+ mnt_ns_tree_add(new_ns);
namespace_unlock();
if (rootmnt)
@@ -4867,6 +4974,12 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
+{
+ s->sm.mask |= STATMOUNT_MNT_NS_ID;
+ s->sm.mnt_ns_id = ns->seq;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret;
@@ -4930,6 +5043,7 @@ static int copy_statmount_to_user(struct kstatmount *s)
static int do_statmount(struct kstatmount *s)
{
struct mount *m = real_mount(s->mnt);
+ struct mnt_namespace *ns = m->mnt_ns;
int err;
/*
@@ -4937,7 +5051,7 @@ static int do_statmount(struct kstatmount *s)
* mounts to show users.
*/
if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
- !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
err = security_sb_statfs(s->mnt->mnt_root);
@@ -4962,6 +5076,9 @@ static int do_statmount(struct kstatmount *s)
if (!err && s->mask & STATMOUNT_MNT_POINT)
err = statmount_string(s, STATMOUNT_MNT_POINT);
+ if (!err && s->mask & STATMOUNT_MNT_NS_ID)
+ statmount_mnt_ns_id(s, ns);
+
if (err)
return err;
@@ -5003,7 +5120,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
int ret;
size_t usize;
- BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
ret = get_user(usize, &req->size);
if (ret)
@@ -5021,10 +5138,63 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
return 0;
}
+static struct mount *listmnt_next(struct mount *curr, bool reverse)
+{
+ struct rb_node *node;
+
+ if (reverse)
+ node = rb_prev(&curr->mnt_node);
+ else
+ node = rb_next(&curr->mnt_node);
+
+ return node_to_mount(node);
+}
+
+static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
+{
+ struct mount *first;
+
+ rwsem_assert_held(&namespace_sem);
+
+ /* We're looking at our own ns, just use get_fs_root. */
+ if (ns == current->nsproxy->mnt_ns) {
+ get_fs_root(current->fs, root);
+ return 0;
+ }
+
+ /*
+ * We have to find the first mount in our ns and use that, however it
+ * may not exist, so handle that properly.
+ */
+ if (RB_EMPTY_ROOT(&ns->mounts))
+ return -ENOENT;
+
+ first = listmnt_next(ns->root, false);
+ if (!first)
+ return -ENOENT;
+ root->mnt = mntget(&first->mnt);
+ root->dentry = dget(root->mnt->mnt_root);
+ return 0;
+}
+
+/*
+ * If the user requested a specific mount namespace id, look that up and return
+ * that, or if not simply grab a passive reference on our mount namespace and
+ * return that.
+ */
+static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+{
+ if (mnt_ns_id)
+ return lookup_mnt_ns(mnt_ns_id);
+ refcount_inc(&current->nsproxy->mnt_ns->passive);
+ return current->nsproxy->mnt_ns;
+}
+
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
struct statmount __user *, buf, size_t, bufsize,
unsigned int, flags)
{
+ struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct vfsmount *mnt;
struct mnt_id_req kreq;
struct kstatmount ks;
@@ -5039,13 +5209,28 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
+ ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ if (!ns)
+ return -ENOENT;
+
+ if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ return -ENOENT;
+
retry:
ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
if (ret)
return ret;
down_read(&namespace_sem);
- mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+ /* Has the namespace already been emptied? */
+ if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) {
+ up_read(&namespace_sem);
+ kvfree(ks.seq.buf);
+ return -ENOENT;
+ }
+
+ mnt = lookup_mnt_in_ns(kreq.mnt_id, ns);
if (!mnt) {
up_read(&namespace_sem);
kvfree(ks.seq.buf);
@@ -5053,7 +5238,12 @@ retry:
}
ks.mnt = mnt;
- get_fs_root(current->fs, &ks.root);
+ ret = grab_requested_root(ns, &ks.root);
+ if (ret) {
+ up_read(&namespace_sem);
+ kvfree(ks.seq.buf);
+ return ret;
+ }
ret = do_statmount(&ks);
path_put(&ks.root);
up_read(&namespace_sem);
@@ -5066,30 +5256,21 @@ retry:
return ret;
}
-static struct mount *listmnt_next(struct mount *curr, bool reverse)
-{
- struct rb_node *node;
-
- if (reverse)
- node = rb_prev(&curr->mnt_node);
- else
- node = rb_next(&curr->mnt_node);
-
- return node_to_mount(node);
-}
-
-static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
- size_t nr_mnt_ids, bool reverse)
+static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
+ u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
+ bool reverse)
{
struct path root __free(path_put) = {};
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct path orig;
struct mount *r, *first;
ssize_t ret;
rwsem_assert_held(&namespace_sem);
- get_fs_root(current->fs, &root);
+ ret = grab_requested_root(ns, &root);
+ if (ret)
+ return ret;
+
if (mnt_parent_id == LSMT_ROOT) {
orig = root;
} else {
@@ -5104,7 +5285,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
* mounts to show users.
*/
if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
- !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
ret = security_sb_statfs(orig.dentry);
@@ -5141,6 +5322,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
{
u64 *kmnt_ids __free(kvfree) = NULL;
const size_t maxcount = 1000000;
+ struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
ssize_t ret;
@@ -5167,8 +5349,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (!kmnt_ids)
return -ENOMEM;
+ ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ if (!ns)
+ return -ENOENT;
+
+ if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ return -ENOENT;
+
scoped_guard(rwsem_read, &namespace_sem)
- ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids,
+ ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids,
nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
@@ -5204,6 +5394,8 @@ static void __init init_mount_tree(void)
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
+
+ mnt_ns_tree_add(ns);
}
void __init mnt_init(void)