diff options
author | Christian Brauner <brauner@kernel.org> | 2024-06-25 11:19:24 +0200 |
---|---|---|
committer | Christian Brauner <brauner@kernel.org> | 2024-06-28 14:36:43 +0200 |
commit | a7ebb0fe43edfc869db3725a5d984de3e47c646c (patch) | |
tree | 4a0d756855ed36e8a0bf2bc0a4a3595ca063567d /fs/namespace.c | |
parent | d04bccd8c19d601232ed3e3c9e248c0040167d47 (diff) | |
parent | d896f71ce1f2e73813dc6f639eb0cf6f4beefdaa (diff) |
Merge patch series "Support foreign mount namespace with statmount/listmount"
Josef Bacik <josef@toxicpanda.com> says:
Currently the only way to iterate over mount entries in mount namespaces that
aren't your own is to trawl through /proc in order to find /proc/$PID/mountinfo
for the mount namespace that you want. This is hugely inefficient, so extend
both statmount() and listmount() to allow specifying a mount namespace id in
order to get to mounts in other mount namespaces.
There are a few components to this:
1. Having a global index of the mount namespace based on the ->seq value in the
mount namespace. This gives us a unique identifier that isn't re-used.
2. Support looking up mount namespaces based on that unique identifier, and
validating the user has permission to access the given mount namespace.
3. Provide a new ioctl() on nsfs in order to extract the unique identifier we
can use for statmount() and listmount().
The code is relatively straightforward, and there is a selftest provided to
validate everything works properly.
This is based on vfs.all as of last week, so it must be applied onto a tree that
has Christian's error handling rework in this area. If you wish, you can pull the
tree directly from here:
https://github.com/josefbacik/linux/tree/listmount.combined
Christian and I collaborated on this series, which is why there are patches from
both of us in this series.
Christian Brauner (4):
fs: relax permissions for listmount()
fs: relax permissions for statmount()
fs: Allow listmount() in foreign mount namespace
fs: Allow statmount() in foreign mount namespace
Josef Bacik (4):
fs: keep an index of current mount namespaces
fs: export the mount ns id via statmount
fs: add an ioctl to get the mnt ns id from nsfs
selftests: add a test for the foreign mnt ns extensions
fs/mount.h | 2 +
fs/namespace.c | 240 ++++++++++--
fs/nsfs.c | 14 +
include/uapi/linux/mount.h | 6 +-
include/uapi/linux/nsfs.h | 2 +
.../selftests/filesystems/statmount/Makefile | 2 +-
.../filesystems/statmount/statmount.h | 46 +++
.../filesystems/statmount/statmount_test.c | 53 +--
.../filesystems/statmount/statmount_test_ns.c | 360 ++++++++++++++++++
9 files changed, 659 insertions(+), 66 deletions(-)
create mode 100644 tools/testing/selftests/filesystems/statmount/statmount.h
create mode 100644 tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
Link: https://lore.kernel.org/r/cover.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'fs/namespace.c')
-rw-r--r-- | fs/namespace.c | 240 |
1 files changed, 216 insertions, 24 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index 02a697287da5..e871f73c4c8c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static DEFINE_RWLOCK(mnt_ns_tree_lock); +static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ struct mount_kattr { unsigned int attr_set; @@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); +static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) +{ + u64 seq_b = ns->seq; + + if (seq < seq_b) + return -1; + if (seq > seq_b) + return 1; + return 0; +} + +static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); +} + +static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) +{ + struct mnt_namespace *ns_a = node_to_mnt_ns(a); + struct mnt_namespace *ns_b = node_to_mnt_ns(b); + u64 seq_a = ns_a->seq; + + return mnt_ns_cmp(seq_a, ns_b) < 0; +} + +static void mnt_ns_tree_add(struct mnt_namespace *ns) +{ + guard(write_lock)(&mnt_ns_tree_lock); + rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); +} + +static void mnt_ns_release(struct mnt_namespace *ns) +{ + lockdep_assert_not_held(&mnt_ns_tree_lock); + + /* keep alive for {list,stat}mount() */ + if (refcount_dec_and_test(&ns->passive)) { + put_user_ns(ns->user_ns); + kfree(ns); + } +} +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) + +static void mnt_ns_tree_remove(struct mnt_namespace *ns) +{ + /* remove from global mount namespace list */ + if (!is_anon_ns(ns)) { + guard(write_lock)(&mnt_ns_tree_lock); + rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + } + + mnt_ns_release(ns); +} + +/* + * Returns the 
mount namespace which either has the specified id, or has the + * next smallest id afer the specified one. + */ +static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) +{ + struct rb_node *node = mnt_ns_tree.rb_node; + struct mnt_namespace *ret = NULL; + + lockdep_assert_held(&mnt_ns_tree_lock); + + while (node) { + struct mnt_namespace *n = node_to_mnt_ns(node); + + if (mnt_ns_id <= n->seq) { + ret = node_to_mnt_ns(node); + if (mnt_ns_id == n->seq) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + return ret; +} + +/* + * Lookup a mount namespace by id and take a passive reference count. Taking a + * passive reference means the mount namespace can be emptied if e.g., the last + * task holding an active reference exits. To access the mounts of the + * namespace the @namespace_sem must first be acquired. If the namespace has + * already shut down before acquiring @namespace_sem, {list,stat}mount() will + * see that the mount rbtree of the namespace is empty. 
+ */ +static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) +{ + struct mnt_namespace *ns; + + guard(read_lock)(&mnt_ns_tree_lock); + ns = mnt_ns_find_id_at(mnt_ns_id); + if (!ns || ns->seq != mnt_ns_id) + return NULL; + + refcount_inc(&ns->passive); + return ns; +} + static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); @@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns) if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - kfree(ns); + mnt_ns_tree_remove(ns); } /* @@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); + refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } + mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -4867,6 +4974,12 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) +{ + s->sm.mask |= STATMOUNT_MNT_NS_ID; + s->sm.mnt_ns_id = ns->seq; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret; @@ -4930,6 +5043,7 @@ static int copy_statmount_to_user(struct kstatmount *s) static int do_statmount(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); + struct mnt_namespace *ns = m->mnt_ns; int err; /* @@ -4937,7 +5051,7 @@ static int do_statmount(struct kstatmount *s) * mounts to show users. 
*/ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; err = security_sb_statfs(s->mnt->mnt_root); @@ -4962,6 +5076,9 @@ static int do_statmount(struct kstatmount *s) if (!err && s->mask & STATMOUNT_MNT_POINT) err = statmount_string(s, STATMOUNT_MNT_POINT); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) + statmount_mnt_ns_id(s, ns); + if (err) return err; @@ -5003,7 +5120,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, int ret; size_t usize; - BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); ret = get_user(usize, &req->size); if (ret) @@ -5021,10 +5138,63 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, return 0; } +static struct mount *listmnt_next(struct mount *curr, bool reverse) +{ + struct rb_node *node; + + if (reverse) + node = rb_prev(&curr->mnt_node); + else + node = rb_next(&curr->mnt_node); + + return node_to_mount(node); +} + +static int grab_requested_root(struct mnt_namespace *ns, struct path *root) +{ + struct mount *first; + + rwsem_assert_held(&namespace_sem); + + /* We're looking at our own ns, just use get_fs_root. */ + if (ns == current->nsproxy->mnt_ns) { + get_fs_root(current->fs, root); + return 0; + } + + /* + * We have to find the first mount in our ns and use that, however it + * may not exist, so handle that properly. + */ + if (RB_EMPTY_ROOT(&ns->mounts)) + return -ENOENT; + + first = listmnt_next(ns->root, false); + if (!first) + return -ENOENT; + root->mnt = mntget(&first->mnt); + root->dentry = dget(root->mnt->mnt_root); + return 0; +} + +/* + * If the user requested a specific mount namespace id, look that up and return + * that, or if not simply grab a passive reference on our mount namespace and + * return that. 
+ */ +static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) +{ + if (mnt_ns_id) + return lookup_mnt_ns(mnt_ns_id); + refcount_inc(¤t->nsproxy->mnt_ns->passive); + return current->nsproxy->mnt_ns; +} + SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct vfsmount *mnt; struct mnt_id_req kreq; struct kstatmount ks; @@ -5039,13 +5209,28 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (ret) return ret; + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + if (!ns) + return -ENOENT; + + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -ENOENT; + retry: ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); if (ret) return ret; down_read(&namespace_sem); - mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); + /* Has the namespace already been emptied? 
*/ + if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) { + up_read(&namespace_sem); + kvfree(ks.seq.buf); + return -ENOENT; + } + + mnt = lookup_mnt_in_ns(kreq.mnt_id, ns); if (!mnt) { up_read(&namespace_sem); kvfree(ks.seq.buf); @@ -5053,7 +5238,12 @@ retry: } ks.mnt = mnt; - get_fs_root(current->fs, &ks.root); + ret = grab_requested_root(ns, &ks.root); + if (ret) { + up_read(&namespace_sem); + kvfree(ks.seq.buf); + return ret; + } ret = do_statmount(&ks); path_put(&ks.root); up_read(&namespace_sem); @@ -5066,30 +5256,21 @@ retry: return ret; } -static struct mount *listmnt_next(struct mount *curr, bool reverse) -{ - struct rb_node *node; - - if (reverse) - node = rb_prev(&curr->mnt_node); - else - node = rb_next(&curr->mnt_node); - - return node_to_mount(node); -} - -static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, - size_t nr_mnt_ids, bool reverse) +static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, + u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, + bool reverse) { struct path root __free(path_put) = {}; - struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); - get_fs_root(current->fs, &root); + ret = grab_requested_root(ns, &root); + if (ret) + return ret; + if (mnt_parent_id == LSMT_ROOT) { orig = root; } else { @@ -5104,7 +5285,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, * mounts to show users. 
*/ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; ret = security_sb_statfs(orig.dentry); @@ -5141,6 +5322,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, { u64 *kmnt_ids __free(kvfree) = NULL; const size_t maxcount = 1000000; + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; ssize_t ret; @@ -5167,8 +5349,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (!kmnt_ids) return -ENOMEM; + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + if (!ns) + return -ENOENT; + + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -ENOENT; + scoped_guard(rwsem_read, &namespace_sem) - ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids, + ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids, nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) @@ -5204,6 +5394,8 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); + + mnt_ns_tree_add(ns); } void __init mnt_init(void) |