author     Linus Torvalds <torvalds@linux-foundation.org>  2025-07-28 10:49:38 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-07-28 10:49:38 -0700
commit     794cbac9c053155754d04231b9365f91ea4ce7d2 (patch)
tree       2ce6a817a77146b52c8937426cbcbd023a401174
parent     953e117bf4aad7e1d01419d4bcc03ab93420387c (diff)
parent     a7cce099450f8fc597a6ac215440666610895fb7 (diff)
Merge tag 'pull-mount' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs mount updates from Al Viro:
- rudiments of mount hash conflicts are gone now - we no longer allow
  multiple mounts with the same parent/mountpoint to be hashed at the
  same time.
- 'struct mount' changes:
- mnt_umounting is gone
- mnt_slave_list/mnt_slave are an hlist_head/hlist_node pair now
- overmounts are now tracked by an explicit pointer in struct mount
- a bunch of flags moved out of mnt_flags into a new field
  (mnt_t_flags), protected only by namespace_sem
- mnt_expiry is protected by mount_lock now (instead of
namespace_sem)
- MNT_LOCKED is used only for mounts that need to remain attached
to their parents to prevent mountpoint exposure - no more
overloading it for absolute roots
- all mnt_list uses are transient now - it's used only to
represent temporary sets during umount_tree()
- mount refcounting change: children no longer pin parents for any
mounts, whether they'd passed through umount_tree() or not
- 'struct mountpoint' changes:
- refcount is no more; what matters is ->m_list emptiness
- instead of temporarily bumping the refcount, we insert a new
  object (pinned_mountpoint) into ->m_list; a short sketch of the
  new convention follows this list
- new calling conventions for lock_mount() and friends
- do_move_mount()/attach_recursive_mnt() seriously cleaned up
- globals in fs/pnode.c are gone
- propagate_mnt(), change_mnt_propagation() and propagate_umount()
cleaned up (in the last case - pretty much completely rewritten).
- freeing of emptied mnt_namespace is done in namespace_unlock(). For
  one thing, there are subtle ordering requirements there; for another
  it simplifies cleanups (condensed sketch after this list)
- assorted cleanups
- restore the machinery for long-term mounts from accumulated bitrot
  (the conversion pattern is sketched after this list). This is going
  to get a followup come next cycle, when the change of
  vfs_parse_fs_string() calling conventions goes into -next
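
A condensed sketch of the 'struct mountpoint' pinning convention,
reassembled from the fs/namespace.c hunks further down in this diff.
example_detach() is an illustrative stand-in caller (cf. the converted
__detach_mounts()); locking and the disposal of unmounted trees are
elided.

	struct pinned_mountpoint {
		struct hlist_node node;
		struct mountpoint *mp;
	};

	static void example_detach(struct dentry *dentry)
	{
		struct pinned_mountpoint mp = {};

		/* pins by linking mp.node into the mountpoint's ->m_list */
		if (!lookup_mountpoint(dentry, &mp))
			return;
		/* ... unmount everything attached at mp.mp ... */
		/* unlinks mp.node; the mountpoint is freed once ->m_list is empty */
		unpin_mountpoint(&mp);
	}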
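
The deferred freeing of an emptied namespace, condensed from the
namespace_unlock() hunk below; notification handling and the disposal
lists are elided.

	static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */

	static void namespace_unlock(void)
	{
		struct mnt_namespace *ns = emptied_ns;

		emptied_ns = NULL;
		/* ... hand off the unmounted list, send notifications ... */
		up_write(&namespace_sem);
		if (unlikely(ns)) {
			/* Make sure we notice when we leak mounts. */
			VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
			free_mnt_ns(ns);
		}
		/* ... dispose of ex-mountpoints and unmounted mounts ... */
	}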
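
The long-term mount restoration shows up in the converted callers
(i915/v3d gemfs, hugetlbfs, mqueue). The sequence below is lifted,
slightly condensed, from the i915 hunk in this diff;
mount_tmpfs_longterm() is an illustrative wrapper name. As the
fc_mount_longterm() hunk shows, it is fc_mount() plus marking the
result as a long-term (MNT_NS_INTERNAL) mount.

	static struct vfsmount *mount_tmpfs_longterm(struct file_system_type *type)
	{
		struct fs_context *fc;
		struct vfsmount *mnt = NULL;
		int ret;

		fc = fs_context_for_mount(type, SB_KERNMOUNT);
		if (IS_ERR(fc))
			return ERR_CAST(fc);

		ret = vfs_parse_fs_string(fc, "source", "tmpfs", strlen("tmpfs"));
		if (!ret)
			ret = vfs_parse_fs_string(fc, "huge", "within_size",
						  strlen("within_size"));
		if (!ret)
			mnt = fc_mount_longterm(fc);	/* was vfs_kern_mount(...) */
		put_fs_context(fc);
		return ret ? ERR_PTR(ret) : mnt;
	}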
* tag 'pull-mount' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (48 commits)
statmount_mnt_basic(): simplify the logics for group id
invent_group_ids(): zero ->mnt_group_id always implies !IS_MNT_SHARED()
get rid of CL_SHARE_TO_SLAVE
take freeing of emptied mnt_namespace to namespace_unlock()
copy_tree(): don't link the mounts via mnt_list
change_mnt_propagation(): move ->mnt_master assignment into MS_SLAVE case
mnt_slave_list/mnt_slave: turn into hlist_head/hlist_node
turn do_make_slave() into transfer_propagation()
do_make_slave(): choose new master sanely
change_mnt_propagation(): do_make_slave() is a no-op unless IS_MNT_SHARED()
change_mnt_propagation() cleanups, step 1
propagate_mnt(): fix comment and convert to kernel-doc, while we are at it
propagate_mnt(): get rid of last_dest
fs/pnode.c: get rid of globals
propagate_one(): fold into the sole caller
propagate_one(): separate the "what should be the master for this copy" part
propagate_one(): separate the "do we need secondary here?" logics
propagate_mnt(): handle all peer groups in the same loop
propagate_one(): get rid of dest_master
mount: separate the flags accessed only under namespace_sem
...
-rw-r--r-- | Documentation/filesystems/propagate_umount.txt | 484
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gemfs.c | 21
-rw-r--r-- | drivers/gpu/drm/v3d/v3d_gemfs.c | 21
-rw-r--r-- | fs/hugetlbfs/inode.c | 2
-rw-r--r-- | fs/mount.h | 40
-rw-r--r-- | fs/namespace.c | 711
-rw-r--r-- | fs/pnode.c | 697
-rw-r--r-- | fs/pnode.h | 27
-rw-r--r-- | include/linux/mount.h | 18
-rw-r--r-- | ipc/mqueue.c | 2
10 files changed, 1216 insertions, 807 deletions
diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt new file mode 100644 index 0000000000000..c90349e5b889f --- /dev/null +++ b/Documentation/filesystems/propagate_umount.txt @@ -0,0 +1,484 @@ + Notes on propagate_umount() + +Umount propagation starts with a set of mounts we are already going to +take out. Ideally, we would like to add all downstream cognates to +that set - anything with the same mountpoint as one of the removed +mounts and with parent that would receive events from the parent of that +mount. However, there are some constraints the resulting set must +satisfy. + +It is convenient to define several properties of sets of mounts: + +1) A set S of mounts is non-shifting if for any mount X belonging +to S all subtrees mounted strictly inside of X (i.e. not overmounting +the root of X) contain only elements of S. + +2) A set S is non-revealing if all locked mounts that belong to S have +parents that also belong to S. + +3) A set S is closed if it contains all children of its elements. + +The set of mounts taken out by umount(2) must be non-shifting and +non-revealing; the first constraint is what allows to reparent +any remaining mounts and the second is what prevents the exposure +of any concealed mountpoints. + +propagate_umount() takes the original set as an argument and tries to +extend that set. The original set is a full subtree and its root is +unlocked; what matters is that it's closed and non-revealing. +Resulting set may not be closed; there might still be mounts outside +of that set, but only on top of stacks of root-overmounting elements +of set. They can be reparented to the place where the bottom of +stack is attached to a mount that will survive. NOTE: doing that +will violate a constraint on having no more than one mount with +the same parent/mountpoint pair; however, the caller (umount_tree()) +will immediately remedy that - it may keep unmounted element attached +to parent, but only if the parent itself is unmounted. Since all +conflicts created by reparenting have common parent *not* in the +set and one side of the conflict (bottom of the stack of overmounts) +is in the set, it will be resolved. However, we rely upon umount_tree() +doing that pretty much immediately after the call of propagate_umount(). + +Algorithm is based on two statements: + 1) for any set S, there is a maximal non-shifting subset of S +and it can be calculated in O(#S) time. + 2) for any non-shifting set S, there is a maximal non-revealing +subset of S. That subset is also non-shifting and it can be calculated +in O(#S) time. + + Finding candidates. + +We are given a closed set U and we want to find all mounts that have +the same mountpoint as some mount m in U *and* whose parent receives +propagation from the parent of the same mount m. Naive implementation +would be + S = {} + for each m in U + add m to S + p = parent(m) + for each q in Propagation(p) - {p} + child = look_up(q, mountpoint(m)) + if child + add child to S +but that can lead to excessive work - there might be propagation among the +subtrees of U, in which case we'd end up examining the same candidates +many times. Since propagation is transitive, the same will happen to +everything downstream of that candidate and it's not hard to construct +cases where the approach above leads to the time quadratic by the actual +number of candidates. 
+ +Note that if we run into a candidate we'd already seen, it must've been +added on an earlier iteration of the outer loop - all additions made +during one iteration of the outer loop have different parents. So +if we find a child already added to the set, we know that everything +in Propagation(parent(child)) with the same mountpoint has been already +added. + S = {} + for each m in U + if m in S + continue + add m to S + p = parent(m) + q = propagation_next(p, p) + while q + child = look_up(q, mountpoint(m)) + if child + if child in S + q = skip_them(q, p) + continue; + add child to S + q = propagation_next(q, p) +where +skip_them(q, p) + keep walking Propagation(p) from q until we find something + not in Propagation(q) + +would get rid of that problem, but we need a sane implementation of +skip_them(). That's not hard to do - split propagation_next() into +"down into mnt_slave_list" and "forward-and-up" parts, with the +skip_them() being "repeat the forward-and-up part until we get NULL +or something that isn't a peer of the one we are skipping". + +Note that there can be no absolute roots among the extra candidates - +they all come from mount lookups. Absolute root among the original +set is _currently_ impossible, but it might be worth protecting +against. + + Maximal non-shifting subsets. + +Let's call a mount m in a set S forbidden in that set if there is a +subtree mounted strictly inside m and containing mounts that do not +belong to S. + +The set is non-shifting when none of its elements are forbidden in it. + +If mount m is forbidden in a set S, it is forbidden in any subset S' it +belongs to. In other words, it can't belong to any of the non-shifting +subsets of S. If we had a way to find a forbidden mount or show that +there's none, we could use it to find the maximal non-shifting subset +simply by finding and removing them until none remain. + +Suppose mount m is forbidden in S; then any mounts forbidden in S - {m} +must have been forbidden in S itself. Indeed, since m has descendents +that do not belong to S, any subtree that fits into S will fit into +S - {m} as well. + +So in principle we could go through elements of S, checking if they +are forbidden in S and removing the ones that are. Removals will +not invalidate the checks done for earlier mounts - if they were not +forbidden at the time we checked, they won't become forbidden later. +It's too costly to be practical, but there is a similar approach that +is linear by size of S. + +Let's say that mount x in a set S is forbidden by mount y, if + * both x and y belong to S. + * there is a chain of mounts starting at x and leaving S + immediately after passing through y, with the first + mountpoint strictly inside x. +Note 1: x may be equal to y - that's the case when something not +belonging to S is mounted strictly inside x. +Note 2: if y does not belong to S, it can't forbid anything in S. +Note 3: if y has no children outside of S, it can't forbid anything in S. + +It's easy to show that mount x is forbidden in S if and only if x is +forbidden in S by some mount y. And it's easy to find all mounts in S +forbidden by a given mount. + +Consider the following operation: + Trim(S, m) = S - {x : x is forbidden by m in S} + +Note that if m does not belong to S or has no children outside of S we +are guaranteed that Trim(S, m) is equal to S. + +The following is true: if x is forbidden by y in Trim(S, m), it was +already forbidden by y in S. + +Proof: Suppose x is forbidden by y in Trim(S, m). 
Then there is a +chain of mounts (x_0 = x, ..., x_k = y, x_{k+1} = r), such that x_{k+1} +is the first element that doesn't belong to Trim(S, m) and the +mountpoint of x_1 is strictly inside x. If mount r belongs to S, it must +have been removed by Trim(S, m), i.e. it was forbidden in S by m. +Then there was a mount chain from r to some child of m that stayed in +S all the way until m, but that's impossible since x belongs to Trim(S, m) +and prepending (x_0, ..., x_k) to that chain demonstrates that x is also +forbidden in S by m, and thus can't belong to Trim(S, m). +Therefore r can not belong to S and our chain demonstrates that +x is forbidden by y in S. QED. + +Corollary: no mount is forbidden by m in Trim(S, m). Indeed, any +such mount would have been forbidden by m in S and thus would have been +in the part of S removed in Trim(S, m). + +Corollary: no mount is forbidden by m in Trim(Trim(S, m), n). Indeed, +any such would have to have been forbidden by m in Trim(S, m), which +is impossible. + +Corollary: after + S = Trim(S, x_1) + S = Trim(S, x_2) + ... + S = Trim(S, x_k) +no mount remaining in S will be forbidden by either of x_1,...,x_k. + +The following will reduce S to its maximal non-shifting subset: + visited = {} + while S contains elements not belonging to visited + let m be an arbitrary such element of S + S = Trim(S, m) + add m to visited + +S never grows, so the number of elements of S not belonging to visited +decreases at least by one on each iteration. When the loop terminates, +all mounts remaining in S belong to visited. It's easy to see that at +the beginning of each iteration no mount remaining in S will be forbidden +by any element of visited. In other words, no mount remaining in S will +be forbidden, i.e. final value of S will be non-shifting. It will be +the maximal non-shifting subset, since we were removing only forbidden +elements. + + There are two difficulties in implementing the above in linear +time, both due to the fact that Trim() might need to remove more than one +element. Naive implementation of Trim() is vulnerable to running into a +long chain of mounts, each mounted on top of parent's root. Nothing in +that chain is forbidden, so nothing gets removed from it. We need to +recognize such chains and avoid walking them again on subsequent calls of +Trim(), otherwise we will end up with worst-case time being quadratic by +the number of elements in S. Another difficulty is in implementing the +outer loop - we need to iterate through all elements of a shrinking set. +That would be trivial if we never removed more than one element at a time +(linked list, with list_for_each_entry_safe for iterator), but we may +need to remove more than one entry, possibly including the ones we have +already visited. + + Let's start with naive algorithm for Trim(): + +Trim_one(m) + found = false + for each n in children(m) + if n not in S + found = true + if (mountpoint(n) != root(m)) + remove m from S + break + if found + Trim_ancestors(m) + +Trim_ancestors(m) + for (; parent(m) in S; m = parent(m)) { + if (mountpoint(m) != root(parent(m))) + remove parent(m) from S + } + +If m belongs to S, Trim_one(m) will replace S with Trim(S, m). +Proof: + Consider the chains excluding elements from Trim(S, m). The last +two elements in such chain are m and some child of m that does not belong +to S. If m has no such children, Trim(S, m) is equal to S. + m itself is removed if and only if the chain has exactly two +elements, i.e. 
when the last element does not overmount the root of m. +In other words, that happens when m has a child not in S that does not +overmount the root of m. + All other elements to remove will be ancestors of m, such that +the entire descent chain from them to m is contained in S. Let +(x_0, x_1, ..., x_k = m) be the longest such chain. x_i needs to be +removed if and only if x_{i+1} does not overmount its root. It's easy +to see that Trim_ancestors(m) will iterate through that chain from +x_k to x_1 and that it will remove exactly the elements that need to be +removed. + + Note that if the loop in Trim_ancestors() walks into an already +visited element, we are guaranteed that remaining iterations will see +only elements that had already been visited and remove none of them. +That's the weakness that makes it vulnerable to long chains of full +overmounts. + + It's easy to deal with, if we can afford setting marks on +elements of S; we would mark all elements already visited by +Trim_ancestors() and have it bail out as soon as it sees an already +marked element. + + The problems with iterating through the set can be dealt with in +several ways, depending upon the representation we choose for our set. +One useful observation is that we are given a closed subset in S - the +original set passed to propagate_umount(). Its elements can neither +forbid anything nor be forbidden by anything - all their descendents +belong to S, so they can not occur anywhere in any excluding chain. +In other words, the elements of that subset will remain in S until +the end and Trim_one(S, m) is a no-op for all m from that subset. + + That suggests keeping S as a disjoint union of a closed set U +('will be unmounted, no matter what') and the set of all elements of +S that do not belong to U. That set ('candidates') is all we need +to iterate through. Let's represent it as a subset in a cyclic list, +consisting of all list elements that are marked as candidates (initially - +all of them). Then we could have Trim_ancestors() only remove the mark, +leaving the elements on the list. Then Trim_one() would never remove +anything other than its argument from the containing list, allowing to +use list_for_each_entry_safe() as iterator. + + Assuming that representation we get the following: + + list_for_each_entry_safe(m, ..., Candidates, ...) + Trim_one(m) +where +Trim_one(m) + if (m is not marked as a candidate) + strip the "seen by Trim_ancestors" mark from m + remove m from the Candidates list + return + + remove_this = false + found = false + for each n in children(m) + if n not in S + found = true + if (mountpoint(n) != root(m)) + remove_this = true + break + if found + Trim_ancestors(m) + if remove_this + strip the "seen by Trim_ancestors" mark from m + strip the "candidate" mark from m + remove m from the Candidate list + +Trim_ancestors(m) + for (p = parent(m); p is marked as candidate ; m = p, p = parent(p)) { + if m is marked as seen by Trim_ancestors + return + mark m as seen by Trim_ancestors + if (mountpoint(m) != root(p)) + strip the "candidate" mark from p + } + + Terminating condition in the loop in Trim_ancestors() is correct, +since that that loop will never run into p belonging to U - p is always +an ancestor of argument of Trim_one() and since U is closed, the argument +of Trim_one() would also have to belong to U. But Trim_one() is never +called for elements of U. In other words, p belongs to S if and only +if it belongs to candidates. 
+ + Time complexity: +* we get no more than O(#S) calls of Trim_one() +* the loop over children in Trim_one() never looks at the same child +twice through all the calls. +* iterations of that loop for children in S are no more than O(#S) +in the worst case +* at most two children that are not elements of S are considered per +call of Trim_one(). +* the loop in Trim_ancestors() sets its mark once per iteration and +no element of S has is set more than once. + + In the end we may have some elements excluded from S by +Trim_ancestors() still stuck on the list. We could do a separate +loop removing them from the list (also no worse than O(#S) time), +but it's easier to leave that until the next phase - there we will +iterate through the candidates anyway. + + The caller has already removed all elements of U from their parents' +lists of children, which means that checking if child belongs to S is +equivalent to checking if it's marked as a candidate; we'll never see +the elements of U in the loop over children in Trim_one(). + + What's more, if we see that children(m) is empty and m is not +locked, we can immediately move m into the committed subset (remove +from the parent's list of children, etc.). That's one fewer mount we'll +have to look into when we check the list of children of its parent *and* +when we get to building the non-revealing subset. + + Maximal non-revealing subsets + +If S is not a non-revealing subset, there is a locked element x in S +such that parent of x is not in S. + +Obviously, no non-revealing subset of S may contain x. Removing such +elements one by one will obviously end with the maximal non-revealing +subset (possibly empty one). Note that removal of an element will +require removal of all its locked children, etc. + +If the set had been non-shifting, it will remain non-shifting after +such removals. +Proof: suppose S was non-shifting, x is a locked element of S, parent of x +is not in S and S - {x} is not non-shifting. Then there is an element m +in S - {x} and a subtree mounted strictly inside m, such that m contains +an element not in in S - {x}. Since S is non-shifting, everything in +that subtree must belong to S. But that means that this subtree must +contain x somewhere *and* that parent of x either belongs that subtree +or is equal to m. Either way it must belong to S. Contradiction. + +// same representation as for finding maximal non-shifting subsets: +// S is a disjoint union of a non-revealing set U (the ones we are committed +// to unmount) and a set of candidates, represented as a subset of list +// elements that have "is a candidate" mark on them. +// Elements of U are removed from their parents' lists of children. +// In the end candidates becomes empty and maximal non-revealing non-shifting +// subset of S is now in U + while (Candidates list is non-empty) + handle_locked(first(Candidates)) + +handle_locked(m) + if m is not marked as a candidate + strip the "seen by Trim_ancestors" mark from m + remove m from the list + return + cutoff = m + for (p = m; p in candidates; p = parent(p)) { + strip the "seen by Trim_ancestors" mark from p + strip the "candidate" mark from p + remove p from the Candidates list + if (!locked(p)) + cutoff = parent(p) + } + if p in U + cutoff = p + while m != cutoff + remove m from children(parent(m)) + add m to U + m = parent(m) + +Let (x_0, ..., x_n = m) be the maximal chain of descent of m within S. +* If it contains some elements of U, let x_k be the last one of those. 
+Then union of U with {x_{k+1}, ..., x_n} is obviously non-revealing. +* otherwise if all its elements are locked, then none of {x_0, ..., x_n} +may be elements of a non-revealing subset of S. +* otherwise let x_k be the first unlocked element of the chain. Then none +of {x_0, ..., x_{k-1}} may be an element of a non-revealing subset of +S and union of U and {x_k, ..., x_n} is non-revealing. + +handle_locked(m) finds which of these cases applies and adjusts Candidates +and U accordingly. U remains non-revealing, union of Candidates and +U still contains any non-revealing subset of S and after the call of +handle_locked(m) m is guaranteed to be not in Candidates list. So having +it called for each element of S would suffice to empty Candidates, +leaving U the maximal non-revealing subset of S. + +However, handle_locked(m) is a no-op when m belongs to U, so it's enough +to have it called for elements of Candidates list until none remain. + +Time complexity: number of calls of handle_locked() is limited by +#Candidates, each iteration of the first loop in handle_locked() removes +an element from the list, so their total number of executions is also +limited by #Candidates; number of iterations in the second loop is no +greater than the number of iterations of the first loop. + + + Reparenting + +After we'd calculated the final set, we still need to deal with +reparenting - if an element of the final set has a child not in it, +we need to reparent such child. + +Such children can only be root-overmounting (otherwise the set wouldn't +be non-shifting) and their parents can not belong to the original set, +since the original is guaranteed to be closed. + + + Putting all of that together + +The plan is to + * find all candidates + * trim down to maximal non-shifting subset + * trim down to maximal non-revealing subset + * reparent anything that needs to be reparented + * return the resulting set to the caller + +For the 2nd and 3rd steps we want to separate the set into growing +non-revealing subset, initially containing the original set ("U" in +terms of the pseudocode above) and everything we are still not sure about +("candidates"). It means that for the output of the 1st step we'd like +the extra candidates separated from the stuff already in the original set. +For the 4th step we would like the additions to U separate from the +original set. + +So let's go for + * original set ("set"). Linkage via mnt_list + * undecided candidates ("candidates"). Subset of a list, +consisting of all its elements marked with a new flag (T_UMOUNT_CANDIDATE). +Initially all elements of the list will be marked that way; in the +end the list will become empty and no mounts will remain marked with +that flag. + * Reuse T_MARKED for "has been already seen by trim_ancestors()". + * anything in U that hadn't been in the original set - elements of +candidates will gradually be either discarded or moved there. In other +words, it's the candidates we have already decided to unmount. Its role +is reasonably close to the old "to_umount", so let's use that name. +Linkage via mnt_list. + +For gather_candidates() we'll need to maintain both candidates (S - +set) and intersection of S with set. Use T_UMOUNT_CANDIDATE for +all elements we encounter, putting the ones not already in the original +set into the list of candidates. When we are done, strip that flag from +all elements of the original set. That gives a cheap way to check +if element belongs to S (in gather_candidates) and to candidates +itself (at later stages). 
Call that predicate is_candidate(); it would +be m->mnt_t_flags & T_UMOUNT_CANDIDATE. + +All elements of the original set are marked with MNT_UMOUNT and we'll +need the same for elements added when joining the contents of to_umount +to set in the end. Let's set MNT_UMOUNT at the time we add an element +to to_umount; that's close to what the old 'umount_one' is doing, so +let's keep that name. It also gives us another predicate we need - +"belongs to union of set and to_umount"; will_be_unmounted() for now. + +Removals from the candidates list should strip both T_MARKED and +T_UMOUNT_CANDIDATE; call it remove_from_candidates_list(). diff --git a/drivers/gpu/drm/i915/gem/i915_gemfs.c b/drivers/gpu/drm/i915/gem/i915_gemfs.c index 65d84a93c5253..a09e2eb471756 100644 --- a/drivers/gpu/drm/i915/gem/i915_gemfs.c +++ b/drivers/gpu/drm/i915/gem/i915_gemfs.c @@ -5,16 +5,23 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include "i915_drv.h" #include "i915_gemfs.h" #include "i915_utils.h" +static int add_param(struct fs_context *fc, const char *key, const char *val) +{ + return vfs_parse_fs_string(fc, key, val, strlen(val)); +} + void i915_gemfs_init(struct drm_i915_private *i915) { - char huge_opt[] = "huge=within_size"; /* r/w */ struct file_system_type *type; + struct fs_context *fc; struct vfsmount *gemfs; + int ret; /* * By creating our own shmemfs mountpoint, we can pass in @@ -38,8 +45,16 @@ void i915_gemfs_init(struct drm_i915_private *i915) if (!type) goto err; - gemfs = vfs_kern_mount(type, SB_KERNMOUNT, type->name, huge_opt); - if (IS_ERR(gemfs)) + fc = fs_context_for_mount(type, SB_KERNMOUNT); + if (IS_ERR(fc)) + goto err; + ret = add_param(fc, "source", "tmpfs"); + if (!ret) + ret = add_param(fc, "huge", "within_size"); + if (!ret) + gemfs = fc_mount_longterm(fc); + put_fs_context(fc); + if (ret) goto err; i915->mm.gemfs = gemfs; diff --git a/drivers/gpu/drm/v3d/v3d_gemfs.c b/drivers/gpu/drm/v3d/v3d_gemfs.c index 4c5e18590a5cf..8ec6ed82b3d94 100644 --- a/drivers/gpu/drm/v3d/v3d_gemfs.c +++ b/drivers/gpu/drm/v3d/v3d_gemfs.c @@ -3,14 +3,21 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include "v3d_drv.h" +static int add_param(struct fs_context *fc, const char *key, const char *val) +{ + return vfs_parse_fs_string(fc, key, val, strlen(val)); +} + void v3d_gemfs_init(struct v3d_dev *v3d) { - char huge_opt[] = "huge=within_size"; struct file_system_type *type; + struct fs_context *fc; struct vfsmount *gemfs; + int ret; /* * By creating our own shmemfs mountpoint, we can pass in @@ -28,8 +35,16 @@ void v3d_gemfs_init(struct v3d_dev *v3d) if (!type) goto err; - gemfs = vfs_kern_mount(type, SB_KERNMOUNT, type->name, huge_opt); - if (IS_ERR(gemfs)) + fc = fs_context_for_mount(type, SB_KERNMOUNT); + if (IS_ERR(fc)) + goto err; + ret = add_param(fc, "source", "tmpfs"); + if (!ret) + ret = add_param(fc, "huge", "within_size"); + if (!ret) + gemfs = fc_mount_longterm(fc); + put_fs_context(fc); + if (ret) goto err; v3d->gemfs = gemfs; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6e0ade365a334..b7994186fc665 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1588,7 +1588,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) } else { struct hugetlbfs_fs_context *ctx = fc->fs_private; ctx->hstate = h; - mnt = fc_mount(fc); + mnt = fc_mount_longterm(fc); put_fs_context(fc); } if (IS_ERR(mnt)) diff --git a/fs/mount.h b/fs/mount.h index ad7173037924a..97737051a8b9d 100644 --- 
a/fs/mount.h +++ b/fs/mount.h @@ -44,7 +44,6 @@ struct mountpoint { struct hlist_node m_hash; struct dentry *m_dentry; struct hlist_head m_list; - int m_count; }; struct mount { @@ -70,8 +69,8 @@ struct mount { struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ - struct list_head mnt_slave_list;/* list of slave mounts */ - struct list_head mnt_slave; /* slave list entry */ + struct hlist_head mnt_slave_list;/* list of slave mounts */ + struct hlist_node mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ @@ -79,21 +78,38 @@ struct mount { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ struct hlist_node mnt_umount; }; - struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; struct list_head to_notify; /* need to queue notification */ struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif + int mnt_t_flags; /* namespace_sem-protected flags */ int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; + struct mount *overmount; /* mounted on ->mnt_root */ } __randomize_layout; +enum { + T_SHARED = 1, /* mount is shared */ + T_UNBINDABLE = 2, /* mount is unbindable */ + T_MARKED = 4, /* internal mark for propagate_... */ + T_UMOUNT_CANDIDATE = 8, /* for propagate_umount */ + + /* + * T_SHARED_MASK is the set of flags that should be cleared when a + * mount becomes shared. Currently, this is only the flag that says a + * mount cannot be bind mounted, since this is how we create a mount + * that shares events with another mount. If you add a new T_* + * flag, consider how it interacts with shared mounts. 
+ */ + T_SHARED_MASK = T_UNBINDABLE, +}; + #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ static inline struct mount *real_mount(struct vfsmount *mnt) @@ -101,7 +117,7 @@ static inline struct mount *real_mount(struct vfsmount *mnt) return container_of(mnt, struct mount, mnt); } -static inline int mnt_has_parent(struct mount *mnt) +static inline int mnt_has_parent(const struct mount *mnt) { return mnt != mnt->mnt_parent; } @@ -146,8 +162,8 @@ struct proc_mounts { extern const struct seq_operations mounts_op; -extern bool __is_local_mountpoint(struct dentry *dentry); -static inline bool is_local_mountpoint(struct dentry *dentry) +extern bool __is_local_mountpoint(const struct dentry *dentry); +static inline bool is_local_mountpoint(const struct dentry *dentry) { if (!d_mountpoint(dentry)) return false; @@ -160,6 +176,13 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline bool anon_ns_root(const struct mount *m) +{ + struct mnt_namespace *ns = READ_ONCE(m->mnt_ns); + + return !IS_ERR_OR_NULL(ns) && is_anon_ns(ns) && m == ns->root; +} + static inline bool mnt_ns_attached(const struct mount *mnt) { return !RB_EMPTY_NODE(&mnt->mnt_node); @@ -170,7 +193,7 @@ static inline bool mnt_ns_empty(const struct mnt_namespace *ns) return RB_EMPTY_ROOT(&ns->mounts); } -static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) +static inline void move_from_ns(struct mount *mnt) { struct mnt_namespace *ns = mnt->mnt_ns; WARN_ON(!mnt_ns_attached(mnt)); @@ -180,7 +203,6 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); - list_add_tail(&mnt->mnt_list, dt_list); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 54c59e091919b..c549bd39c210a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -79,6 +79,7 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY @@ -380,10 +381,9 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); - INIT_LIST_HEAD(&mnt->mnt_slave_list); - INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_HLIST_HEAD(&mnt->mnt_slave_list); + INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); - INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; @@ -894,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path) * namespace not just a mount that happens to have some specified * parent mount. 
*/ -bool __is_local_mountpoint(struct dentry *dentry) +bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; @@ -911,42 +911,48 @@ bool __is_local_mountpoint(struct dentry *dentry) return is_covered; } -static struct mountpoint *lookup_mountpoint(struct dentry *dentry) +struct pinned_mountpoint { + struct hlist_node node; + struct mountpoint *mp; +}; + +static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { - mp->m_count++; - return mp; + hlist_add_head(&m->node, &mp->m_list); + m->mp = mp; + return true; } } - return NULL; + return false; } -static struct mountpoint *get_mountpoint(struct dentry *dentry) +static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { - struct mountpoint *mp, *new = NULL; + struct mountpoint *mp __free(kfree) = NULL; + bool found; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) - return ERR_PTR(-ENOENT); + return -ENOENT; mountpoint: read_seqlock_excl(&mount_lock); - mp = lookup_mountpoint(dentry); + found = lookup_mountpoint(dentry, m); read_sequnlock_excl(&mount_lock); - if (mp) - goto done; + if (found) + return 0; } - if (!new) - new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!new) - return ERR_PTR(-ENOMEM); - + if (!mp) + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return -ENOMEM; /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); @@ -956,34 +962,28 @@ mountpoint: goto mountpoint; /* The dentry is not available as a mountpoint? */ - mp = ERR_PTR(ret); if (ret) - goto done; + return ret; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); - new->m_dentry = dget(dentry); - new->m_count = 1; - hlist_add_head(&new->m_hash, mp_hash(dentry)); - INIT_HLIST_HEAD(&new->m_list); + mp->m_dentry = dget(dentry); + hlist_add_head(&mp->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&mp->m_list); + hlist_add_head(&m->node, &mp->m_list); + m->mp = no_free_ptr(mp); read_sequnlock_excl(&mount_lock); - - mp = new; - new = NULL; -done: - kfree(new); - return mp; + return 0; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. 
*/ -static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) +static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list) { - if (!--mp->m_count) { + if (hlist_empty(&mp->m_list)) { struct dentry *dentry = mp->m_dentry; - BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@ -993,10 +993,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) } } -/* called with namespace_lock and vfsmount lock */ -static void put_mountpoint(struct mountpoint *mp) +/* + * locks: mount_lock [read_seqlock_excl], namespace_sem [excl] + */ +static void unpin_mountpoint(struct pinned_mountpoint *m) { - __put_mountpoint(mp, &ex_mountpoints); + if (m->mp) { + hlist_del(&m->node); + maybe_free_mountpoint(m->mp, &ex_mountpoints); + } } static inline int check_mnt(struct mount *mnt) @@ -1038,11 +1043,14 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock] */ -static struct mountpoint *unhash_mnt(struct mount *mnt) +static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list) { struct mountpoint *mp; + struct mount *parent = mnt->mnt_parent; + if (unlikely(parent->overmount == mnt)) + parent->overmount = NULL; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); @@ -1050,15 +1058,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt) hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; - return mp; + maybe_free_mountpoint(mp, shrink_list); } /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints) */ static void umount_mnt(struct mount *mnt) { - put_mountpoint(unhash_mnt(mnt)); + __umount_mnt(mnt, &ex_mountpoints); } /* @@ -1068,43 +1076,17 @@ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { - mp->m_count++; - mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } -/** - * mnt_set_mountpoint_beneath - mount a mount beneath another one - * - * @new_parent: the source mount - * @top_mnt: the mount beneath which @new_parent is mounted - * @new_mp: the new mountpoint of @top_mnt on @new_parent - * - * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and - * parent @top_mnt->mnt_parent and mount it on top of @new_parent at - * @new_mp. And mount @new_parent on the old parent and old - * mountpoint of @top_mnt. - * - * Context: This function expects namespace_lock() and lock_mount_hash() - * to have been acquired in that order. 
- */ -static void mnt_set_mountpoint_beneath(struct mount *new_parent, - struct mount *top_mnt, - struct mountpoint *new_mp) -{ - struct mount *old_top_parent = top_mnt->mnt_parent; - struct mountpoint *old_top_mp = top_mnt->mnt_mp; - - mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); - mnt_change_mountpoint(new_parent, new_mp, top_mnt); -} - - -static void __attach_mnt(struct mount *mnt, struct mount *parent) +static void make_visible(struct mount *mnt) { + struct mount *parent = mnt->mnt_parent; + if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root)) + parent->overmount = mnt; hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); @@ -1116,51 +1098,34 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent) * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint - * @beneath: whether to mount @mnt beneath or on top of @parent * - * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * Mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * - * If @beneath is true, remove @mnt from its current parent and - * mountpoint and mount it on @mp on @parent, and mount @parent on the - * old parent and old mountpoint of @mnt. Finally, attach @parent to - * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. - * - * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * Note, when make_visible() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, - struct mountpoint *mp, bool beneath) + struct mountpoint *mp) { - if (beneath) - mnt_set_mountpoint_beneath(mnt, parent, mp); - else - mnt_set_mountpoint(parent, mp, mnt); - /* - * Note, @mnt->mnt_parent has to be used. If @mnt was mounted - * beneath @parent then @mnt will need to be attached to - * @parent's old parent, not @parent. IOW, @mnt->mnt_parent - * isn't the same mount as @parent. 
- */ - __attach_mnt(mnt, mnt->mnt_parent); + mnt_set_mountpoint(parent, mp, mnt); + make_visible(mnt); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; - struct mount *old_parent = mnt->mnt_parent; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); - attach_mnt(mnt, parent, mp, false); + attach_mnt(mnt, parent, mp); - put_mountpoint(old_mp); - mnt_add_count(old_parent, -1); + maybe_free_mountpoint(old_mp, &ex_mountpoints); } static inline struct mount *node_to_mount(struct rb_node *node) @@ -1197,32 +1162,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) mnt_notify_add(mnt); } -/* - * vfsmount lock must be held for write - */ -static void commit_tree(struct mount *mnt) -{ - struct mount *parent = mnt->mnt_parent; - struct mount *m; - LIST_HEAD(head); - struct mnt_namespace *n = parent->mnt_ns; - - BUG_ON(parent == mnt); - - list_add_tail(&head, &mnt->mnt_list); - while (!list_empty(&head)) { - m = list_first_entry(&head, typeof(*m), mnt_list); - list_del(&m->mnt_list); - - mnt_add_to_ns(n, m); - } - n->nr_mounts += n->pending_mounts; - n->pending_mounts = 0; - - __attach_mnt(mnt, parent); - touch_mnt_namespace(n); -} - static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; @@ -1249,6 +1188,27 @@ static struct mount *skip_mnt_tree(struct mount *p) return p; } +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct mount *mnt) +{ + struct mnt_namespace *n = mnt->mnt_parent->mnt_ns; + + if (!mnt_ns_attached(mnt)) { + for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) + if (unlikely(mnt_ns_attached(m))) + m = skip_mnt_tree(m); + else + mnt_add_to_ns(n, m); + n->nr_mounts += n->pending_mounts; + n->pending_mounts = 0; + } + + make_visible(mnt); + touch_mnt_namespace(n); +} + /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached @@ -1296,6 +1256,15 @@ struct vfsmount *fc_mount(struct fs_context *fc) } EXPORT_SYMBOL(fc_mount); +struct vfsmount *fc_mount_longterm(struct fs_context *fc) +{ + struct vfsmount *mnt = fc_mount(fc); + if (!IS_ERR(mnt)) + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; + return mnt; +} +EXPORT_SYMBOL(fc_mount_longterm); + struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) @@ -1337,7 +1306,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); - if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & + ~MNT_INTERNAL_FLAGS; + + if (flag & (CL_SLAVE | CL_PRIVATE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; @@ -1348,8 +1320,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + if (mnt->mnt_group_id) + set_mnt_shared(mnt); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1362,30 +1334,19 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, list_add_tail(&mnt->mnt_instance, &sb->s_mounts); unlock_mount_hash(); - if ((flag & CL_SLAVE) || - ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { - list_add(&mnt->mnt_slave, 
&old->mnt_slave_list); + if (flag & CL_PRIVATE) // we are done with it + return mnt; + + if (peers(mnt, old)) + list_add(&mnt->mnt_share, &old->mnt_share); + + if ((flag & CL_SLAVE) && old->mnt_group_id) { + hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; - CLEAR_MNT_SHARED(mnt); - } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) - list_add(&mnt->mnt_share, &old->mnt_share); - if (IS_MNT_SLAVE(old)) - list_add(&mnt->mnt_slave, &old->mnt_slave); + } else if (IS_MNT_SLAVE(old)) { + hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; - } else { - CLEAR_MNT_SHARED(mnt); } - if (flag & CL_MAKE_SHARED) - set_mnt_shared(mnt); - - /* stick the duplicate mount on the same expiry list - * as the original if that was on one */ - if (flag & CL_EXPIRE) { - if (!list_empty(&old->mnt_expire)) - list_add(&mnt->mnt_expire, &old->mnt_expire); - } - return mnt; out_free: @@ -1478,11 +1439,13 @@ static void mntput_no_expire(struct mount *mnt) rcu_read_unlock(); list_del(&mnt->mnt_instance); + if (unlikely(!list_empty(&mnt->mnt_expire))) + list_del(&mnt->mnt_expire); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { - __put_mountpoint(unhash_mnt(p), &list); + __umount_mnt(p, &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } @@ -1679,23 +1642,19 @@ const struct seq_operations mounts_op = { int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); - int actual_refs = 0; - int minimum_refs = 0; - struct mount *p; - BUG_ON(!m); + bool busy = false; /* write lock needed for mnt_get_count */ lock_mount_hash(); - for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += mnt_get_count(p); - minimum_refs += 2; + for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { + if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { + busy = true; + break; + } } unlock_mount_hash(); - if (actual_refs > minimum_refs) - return 0; - - return 1; + return !busy; } EXPORT_SYMBOL(may_umount_tree); @@ -1771,15 +1730,18 @@ static bool need_notify_mnt_list(void) } #endif +static void free_mnt_ns(struct mnt_namespace *); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; + struct mnt_namespace *ns = emptied_ns; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); + emptied_ns = NULL; if (need_notify_mnt_list()) { /* @@ -1793,6 +1755,11 @@ static void namespace_unlock(void) } else { up_write(&namespace_sem); } + if (unlikely(ns)) { + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); + free_mnt_ns(ns); + } shrink_dentry_list(&list); @@ -1865,9 +1832,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; if (mnt_ns_attached(p)) - move_from_ns(p, &tmp_list); - else - list_move(&p->mnt_list, &tmp_list); + move_from_ns(p); + list_add_tail(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ @@ -1896,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { - mnt_add_count(p->mnt_parent, -1); if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); @@ -1973,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags) * all race cases, but it's a slowpath. 
*/ lock_mount_hash(); - if (mnt_get_count(mnt) != 2) { + if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } @@ -2019,23 +1984,27 @@ static int do_umount(struct mount *mnt, int flags) namespace_lock(); lock_mount_hash(); - /* Recheck MNT_LOCKED with the locks held */ + /* Repeat the earlier racy checks, now that we are holding the locks */ retval = -EINVAL; + if (!check_mnt(mnt)) + goto out; + if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; + if (!mnt_has_parent(mnt)) /* not the absolute root */ + goto out; + event++; if (flags & MNT_DETACH) { - if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) - umount_tree(mnt, UMOUNT_PROPAGATE); + umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) - umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } @@ -2057,25 +2026,24 @@ out: */ void __detach_mounts(struct dentry *dentry) { - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct mount *mnt; namespace_lock(); lock_mount_hash(); - mp = lookup_mountpoint(dentry); - if (!mp) + if (!lookup_mountpoint(dentry, &mp)) goto out_unlock; event++; - while (!hlist_empty(&mp->m_list)) { - mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); + while (mp.node.next) { + mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } - put_mountpoint(mp); + unpin_mountpoint(&mp); out_unlock: unlock_mount_hash(); namespace_unlock(); @@ -2259,7 +2227,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, return dst_mnt; src_parent = src_root; - dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint; list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) @@ -2294,8 +2261,16 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); - list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); - attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); + if (src_mnt->mnt.mnt_flags & MNT_LOCKED) + dst_mnt->mnt.mnt_flags |= MNT_LOCKED; + if (unlikely(flag & CL_EXPIRE)) { + /* stick the duplicate mount on the same expiry + * list as the original if that was on one */ + if (!list_empty(&src_mnt->mnt_expire)) + list_add(&dst_mnt->mnt_expire, + &src_mnt->mnt_expire); + } + attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); unlock_mount_hash(); } } @@ -2368,78 +2343,36 @@ void drop_collected_paths(struct path *paths, struct path *prealloc) kfree(paths); } -static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); -static inline bool must_dissolve(struct mnt_namespace *mnt_ns) -{ - /* - * This mount belonged to an anonymous mount namespace - * but was moved to a non-anonymous mount namespace and - * then unmounted. - */ - if (unlikely(!mnt_ns)) - return false; - - /* - * This mount belongs to a non-anonymous mount namespace - * and we know that such a mount can never transition to - * an anonymous mount namespace again. 
- */ - if (!is_anon_ns(mnt_ns)) { - /* - * A detached mount either belongs to an anonymous mount - * namespace or a non-anonymous mount namespace. It - * should never belong to something purely internal. - */ - VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); - return false; - } - - return true; -} - void dissolve_on_fput(struct vfsmount *mnt) { - struct mnt_namespace *ns; struct mount *m = real_mount(mnt); + /* + * m used to be the root of anon namespace; if it still is one, + * we need to dissolve the mount tree and free that namespace. + * Let's try to avoid taking namespace_sem if we can determine + * that there's nothing to do without it - rcu_read_lock() is + * enough to make anon_ns_root() memory-safe and once m has + * left its namespace, it's no longer our concern, since it will + * never become a root of anon ns again. + */ + scoped_guard(rcu) { - if (!must_dissolve(READ_ONCE(m->mnt_ns))) + if (!anon_ns_root(m)) return; } scoped_guard(namespace_lock, &namespace_sem) { - ns = m->mnt_ns; - if (!must_dissolve(ns)) - return; - - /* - * After must_dissolve() we know that this is a detached - * mount in an anonymous mount namespace. - * - * Now when mnt_has_parent() reports that this mount - * tree has a parent, we know that this anonymous mount - * tree has been moved to another anonymous mount - * namespace. - * - * So when closing this file we cannot unmount the mount - * tree. This will be done when the file referring to - * the root of the anonymous mount namespace will be - * closed (It could already be closed but it would sync - * on @namespace_sem and wait for us to finish.). - */ - if (mnt_has_parent(m)) + if (!anon_ns_root(m)) return; + emptied_ns = m->mnt_ns; lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); } - - /* Make sure we notice when we leak mounts. */ - VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); - free_mnt_ns(ns); } static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) @@ -2518,9 +2451,7 @@ struct vfsmount *clone_private_mount(const struct path *path) * loops get created. */ if (!check_mnt(old_mnt)) { - if (!is_mounted(&old_mnt->mnt) || - !is_anon_ns(old_mnt->mnt_ns) || - mnt_has_parent(old_mnt)) + if (!anon_ns_root(old_mnt)) return ERR_PTR(-EINVAL); if (!check_for_nsfs_mounts(old_mnt)) @@ -2564,7 +2495,7 @@ static void lock_mnt_tree(struct mount *mnt) if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ - if (list_empty(&p->mnt_expire)) + if (list_empty(&p->mnt_expire) && p != mnt) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } @@ -2585,7 +2516,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse) struct mount *p; for (p = mnt; p; p = recurse ? 
next_mnt(p, mnt) : NULL) { - if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + if (!p->mnt_group_id) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); @@ -2621,17 +2552,15 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) } enum mnt_tree_flags_t { - MNT_TREE_MOVE = BIT(0), - MNT_TREE_BENEATH = BIT(1), - MNT_TREE_PROPAGATION = BIT(2), + MNT_TREE_BENEATH = BIT(0), + MNT_TREE_PROPAGATION = BIT(1), }; /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached - * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath + * @dest_mnt: mount that @source_mnt will be mounted on * @dest_mp: the mountpoint @source_mnt will be mounted at - * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2694,26 +2623,31 @@ enum mnt_tree_flags_t { * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *top_mnt, - struct mountpoint *dest_mp, - enum mnt_tree_flags_t flags) + struct mount *dest_mnt, + struct mountpoint *dest_mp) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); - struct mnt_namespace *ns = top_mnt->mnt_ns; - struct mountpoint *smp; - struct mount *child, *dest_mnt, *p; + struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct pinned_mountpoint root = {}; + struct mountpoint *shorter = NULL; + struct mount *child, *p; + struct mount *top; struct hlist_node *n; int err = 0; - bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; + bool moving = mnt_has_parent(source_mnt); /* * Preallocate a mountpoint in case the new mounts need to be * mounted beneath mounts on the same mountpoint. */ - smp = get_mountpoint(source_mnt->mnt.mnt_root); - if (IS_ERR(smp)) - return PTR_ERR(smp); + for (top = source_mnt; unlikely(top->overmount); top = top->overmount) { + if (!shorter && is_mnt_ns_file(top->mnt.mnt_root)) + shorter = top->mnt_mp; + } + err = get_mountpoint(top->mnt.mnt_root, &root); + if (err) + return err; /* Is there space to add these mounts to the mount namespace? 
*/ if (!moving) { @@ -2722,11 +2656,6 @@ static int attach_recursive_mnt(struct mount *source_mnt, goto out; } - if (beneath) - dest_mnt = top_mnt->mnt_parent; - else - dest_mnt = top_mnt; - if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -2743,42 +2672,50 @@ static int attach_recursive_mnt(struct mount *source_mnt, } if (moving) { - if (beneath) - dest_mp = smp; - unhash_mnt(source_mnt); - attach_mnt(source_mnt, top_mnt, dest_mp, beneath); + umount_mnt(source_mnt); mnt_notify_add(source_mnt); - touch_mnt_namespace(source_mnt->mnt_ns); + /* if the mount is moved, it should no longer be expired + * automatically */ + list_del_init(&source_mnt->mnt_expire); } else { if (source_mnt->mnt_ns) { - LIST_HEAD(head); - /* move from anon - the caller will destroy */ + emptied_ns = source_mnt->mnt_ns; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) - move_from_ns(p, &head); - list_del_init(&head); + move_from_ns(p); } - if (beneath) - mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); - else - mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt); } + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + /* + * Now the original copy is in the same state as the secondaries - + * its root attached to mountpoint, but not hashed and all mounts + * in it are either in our namespace or in no namespace at all. + * Add the original to the list of copies and deal with the + * rest of work for all of them uniformly. + */ + hlist_add_head(&source_mnt->mnt_hash, &tree_list); + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); - child->mnt.mnt_flags &= ~MNT_LOCKED; q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); - if (q) - mnt_change_mountpoint(child, smp, q); + if (q) { + struct mountpoint *mp = root.mp; + struct mount *r = child; + while (unlikely(r->overmount)) + r = r->overmount; + if (unlikely(shorter) && child != source_mnt) + mp = shorter; + mnt_change_mountpoint(r, mp, q); + } commit_tree(child); } - put_mountpoint(smp); + unpin_mountpoint(&root); unlock_mount_hash(); return 0; @@ -2795,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); - put_mountpoint(smp); + unpin_mountpoint(&root); read_sequnlock_excl(&mount_lock); return err; @@ -2835,12 +2772,12 @@ static int attach_recursive_mnt(struct mount *source_mnt, * Return: Either the target mountpoint on the top mount or the top * mount's mountpoint. 
*/ -static struct mountpoint *do_lock_mount(struct path *path, bool beneath) +static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath) { struct vfsmount *mnt = path->mnt; struct dentry *dentry; - struct mountpoint *mp = ERR_PTR(-ENOENT); struct path under = {}; + int err = -ENOENT; for (;;) { struct mount *m = real_mount(mnt); @@ -2878,8 +2815,8 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) path->dentry = dget(mnt->mnt_root); continue; // got overmounted } - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) + err = get_mountpoint(dentry, pinned); + if (err) break; if (beneath) { /* @@ -2890,25 +2827,25 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) */ path_put(&under); } - return mp; + return 0; } namespace_unlock(); inode_unlock(dentry->d_inode); if (beneath) path_put(&under); - return mp; + return err; } -static inline struct mountpoint *lock_mount(struct path *path) +static inline int lock_mount(struct path *path, struct pinned_mountpoint *m) { - return do_lock_mount(path, false); + return do_lock_mount(path, m, false); } -static void unlock_mount(struct mountpoint *where) +static void unlock_mount(struct pinned_mountpoint *m) { - inode_unlock(where->m_dentry->d_inode); + inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); - put_mountpoint(where); + unpin_mountpoint(m); read_sequnlock_excl(&mount_lock); namespace_unlock(); } @@ -2922,7 +2859,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, 0); + return attach_recursive_mnt(mnt, p, mp); } /* @@ -2971,10 +2908,8 @@ static int do_change_type(struct path *path, int ms_flags) goto out_unlock; } - lock_mount_hash(); for (m = mnt; m; m = (recurse ? 
next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - unlock_mount_hash(); out_unlock: namespace_unlock(); @@ -3048,26 +2983,21 @@ static inline bool may_copy_tree(struct path *path) static struct mount *__do_loopback(struct path *old_path, int recurse) { - struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + struct mount *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) - return mnt; + return ERR_PTR(-EINVAL); if (!may_copy_tree(old_path)) - return mnt; + return ERR_PTR(-EINVAL); if (!recurse && __has_locked_children(old, old_path->dentry)) - return mnt; + return ERR_PTR(-EINVAL); if (recurse) - mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else - mnt = clone_mnt(old, old_path->dentry, 0); - - if (!IS_ERR(mnt)) - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - - return mnt; + return clone_mnt(old, old_path->dentry, 0); } /* @@ -3078,7 +3008,7 @@ static int do_loopback(struct path *path, const char *old_name, { struct path old_path; struct mount *mnt = NULL, *parent; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; int err; if (!old_name || !*old_name) return -EINVAL; @@ -3090,11 +3020,9 @@ static int do_loopback(struct path *path, const char *old_name, if (mnt_ns_loop(old_path.dentry)) goto out; - mp = lock_mount(path); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); + err = lock_mount(path, &mp); + if (err) goto out; - } parent = real_mount(path->mnt); if (!check_mnt(parent)) @@ -3106,14 +3034,14 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; } - err = graft_tree(mnt, parent, mp); + err = graft_tree(mnt, parent, mp.mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: - unlock_mount(mp); + unlock_mount(&mp); out: path_put(&old_path); return err; @@ -3461,18 +3389,14 @@ static int do_set_group(struct path *from_path, struct path *to_path) goto out; if (IS_MNT_SLAVE(from)) { - struct mount *m = from->mnt_master; - - list_add(&to->mnt_slave, &from->mnt_slave); - to->mnt_master = m; + hlist_add_behind(&to->mnt_slave, &from->mnt_slave); + to->mnt_master = from->mnt_master; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); - lock_mount_hash(); set_mnt_shared(to); - unlock_mount_hash(); } err = 0; @@ -3509,6 +3433,17 @@ static inline bool path_overmounted(const struct path *path) return unlikely(!no_child); } +/* + * Check if there is a possibly empty chain of descent from p1 to p2. + * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). 
+ */ +static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) +{ + while (p2 != p1 && mnt_has_parent(p2)) + p2 = p2->mnt_parent; + return p2 == p1; +} + /** * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath @@ -3560,9 +3495,8 @@ static int can_move_mount_beneath(const struct path *from, if (parent_mnt_to == current->nsproxy->mnt_ns->root) return -EINVAL; - for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) - if (p == mnt_to) - return -EINVAL; + if (mount_is_ancestor(mnt_to, mnt_from)) + return -EINVAL; /* * If the parent mount propagates to the child mount this would @@ -3647,27 +3581,20 @@ static int do_move_mount(struct path *old_path, struct mount *p; struct mount *old; struct mount *parent; - struct mountpoint *mp, *old_mp; + struct pinned_mountpoint mp; int err; - bool attached, beneath = flags & MNT_TREE_BENEATH; + bool beneath = flags & MNT_TREE_BENEATH; - mp = do_lock_mount(new_path, beneath); - if (IS_ERR(mp)) - return PTR_ERR(mp); + err = do_lock_mount(new_path, &mp, beneath); + if (err) + return err; old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); parent = old->mnt_parent; - attached = mnt_has_parent(old); - if (attached) - flags |= MNT_TREE_MOVE; - old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; - /* The thing moved must be mounted... */ - if (!is_mounted(&old->mnt)) - goto out; if (check_mnt(old)) { /* if the source is in our namespace... */ @@ -3677,13 +3604,14 @@ static int do_move_mount(struct path *old_path, /* ... and the target should be in our namespace */ if (!check_mnt(p)) goto out; + /* parent of the source should not be shared */ + if (IS_MNT_SHARED(parent)) + goto out; } else { /* * otherwise the source must be the root of some anon namespace. - * AV: check for mount being root of an anon namespace is worth - * an inlined predicate... */ - if (!is_anon_ns(ns) || mnt_has_parent(old)) + if (!anon_ns_root(old)) goto out; /* * Bail out early if the target is within the same namespace - @@ -3706,20 +3634,14 @@ static int do_move_mount(struct path *old_path, if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) goto out; - /* - * Don't move a mount residing in a shared parent. - */ - if (attached && IS_MNT_SHARED(parent)) - goto out; if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp); + err = can_move_mount_beneath(old_path, new_path, mp.mp); if (err) goto out; err = -EINVAL; p = p->mnt_parent; - flags |= MNT_TREE_BENEATH; } /* @@ -3731,30 +3653,12 @@ static int do_move_mount(struct path *old_path, err = -ELOOP; if (!check_for_nsfs_mounts(old)) goto out; - for (; mnt_has_parent(p); p = p->mnt_parent) - if (p == old) - goto out; - - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); - if (err) + if (mount_is_ancestor(old, p)) goto out; - /* if the mount is moved, it should no longer be expire - * automatically */ - list_del_init(&old->mnt_expire); - if (attached) - put_mountpoint(old_mp); + err = attach_recursive_mnt(old, p, mp.mp); out: - unlock_mount(mp); - if (!err) { - if (attached) { - mntput_no_expire(parent); - } else { - /* Make sure we notice when we leak mounts. 
*/ - VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); - free_mnt_ns(ns); - } - } + unlock_mount(&mp); return err; } @@ -3815,7 +3719,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct super_block *sb = fc->root->d_sb; int error; @@ -3836,13 +3740,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - mp = lock_mount(mountpoint); - if (IS_ERR(mp)) { - mntput(mnt); - return PTR_ERR(mp); + error = lock_mount(mountpoint, &mp); + if (!error) { + error = do_add_mount(real_mount(mnt), mp.mp, + mountpoint, mnt_flags); + unlock_mount(&mp); } - error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); - unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -3910,7 +3813,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct mount *mnt; int err; @@ -3942,14 +3845,13 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = 0; goto discard_locked; } - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); + err = get_mountpoint(dentry, &mp); + if (err) goto discard_locked; - } - err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); - unlock_mount(mp); + err = do_add_mount(mnt, mp.mp, path, + path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(&mp); if (unlikely(err)) goto discard; return 0; @@ -3958,12 +3860,6 @@ discard_locked: namespace_unlock(); inode_unlock(dentry->d_inode); discard: - /* remove m from any expiration list it may be on */ - if (!list_empty(&mnt->mnt_expire)) { - namespace_lock(); - list_del_init(&mnt->mnt_expire); - namespace_unlock(); - } mntput(m); return err; } @@ -3975,11 +3871,9 @@ discard: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - namespace_lock(); - + read_seqlock_excl(&mount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - - namespace_unlock(); + read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); @@ -4333,7 +4227,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); @@ -4758,7 +4652,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, { struct path new, old, root; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; - struct mountpoint *old_mp, *root_mp; + struct pinned_mountpoint old_mp = {}; int error; if (!may_mount()) @@ -4779,9 +4673,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; get_fs_root(current->fs, &root); - old_mp = lock_mount(&old); - error = PTR_ERR(old_mp); - if (IS_ERR(old_mp)) + error = lock_mount(&old, &old_mp); + if (error) goto out3; error = -EINVAL; @@ -4808,11 +4701,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if 
(!mnt_has_parent(new_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; @@ -4821,29 +4714,25 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; lock_mount_hash(); umount_mnt(new_mnt); - root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } - /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp, false); - mnt_add_count(root_parent, -1); + attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); + umount_mnt(root_mnt); + /* mount old root on put_old */ + attach_mnt(root_mnt, old_mnt, old_mp.mp); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); - put_mountpoint(root_mp); unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: - unlock_mount(old_mp); - if (!error) - mntput_no_expire(ex_parent); + unlock_mount(&old_mp); out3: path_put(&root); out2: @@ -5045,22 +4934,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) err = -EINVAL; lock_mount_hash(); - /* Ensure that this isn't anything purely vfs internal. */ - if (!is_mounted(&mnt->mnt)) - goto out; - - /* - * If this is an attached mount make sure it's located in the callers - * mount namespace. If it's not don't let the caller interact with it. - * - * If this mount doesn't have a parent it's most often simply a - * detached mount with an anonymous mount namespace. IOW, something - * that's simply not attached yet. But there are apparently also users - * that do change mount properties on the rootfs itself. That obviously - * neither has a parent nor is it a detached mount so we cannot - * unconditionally check for detached mounts. - */ - if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) + if (!anon_ns_root(mnt) && !check_mnt(mnt)) goto out; /* @@ -5424,7 +5298,7 @@ static void statmount_mnt_basic(struct kstatmount *s) s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); - s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; + s->sm.mnt_peer_group = m->mnt_group_id; s->sm.mnt_master = IS_MNT_SLAVE(m) ? 
m->mnt_master->mnt_group_id : 0; } @@ -6228,7 +6102,6 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; - mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -6276,11 +6149,11 @@ void put_mnt_ns(struct mnt_namespace *ns) if (!refcount_dec_and_test(&ns->ns.count)) return; namespace_lock(); + emptied_ns = ns; lock_mount_hash(); umount_tree(ns->root, 0); unlock_mount_hash(); namespace_unlock(); - free_mnt_ns(ns); } struct vfsmount *kern_mount(struct file_system_type *type) diff --git a/fs/pnode.c b/fs/pnode.c index ffd429b760d5d..81f7599bdac4f 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -21,17 +21,12 @@ static inline struct mount *next_peer(struct mount *p) static inline struct mount *first_slave(struct mount *p) { - return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); -} - -static inline struct mount *last_slave(struct mount *p) -{ - return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); + return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave); } static inline struct mount *next_slave(struct mount *p) { - return list_entry(p->mnt_slave.next, struct mount, mnt_slave); + return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave); } static struct mount *get_peer_under_root(struct mount *mnt, @@ -70,69 +65,90 @@ int get_dominating_id(struct mount *mnt, const struct path *root) return 0; } -static int do_make_slave(struct mount *mnt) +static inline bool will_be_unmounted(struct mount *m) { - struct mount *master, *slave_mnt; + return m->mnt.mnt_flags & MNT_UMOUNT; +} - if (list_empty(&mnt->mnt_share)) { - if (IS_MNT_SHARED(mnt)) { - mnt_release_group_id(mnt); - CLEAR_MNT_SHARED(mnt); - } - master = mnt->mnt_master; - if (!master) { - struct list_head *p = &mnt->mnt_slave_list; - while (!list_empty(p)) { - slave_mnt = list_first_entry(p, - struct mount, mnt_slave); - list_del_init(&slave_mnt->mnt_slave); - slave_mnt->mnt_master = NULL; - } - return 0; - } - } else { +static struct mount *propagation_source(struct mount *mnt) +{ + do { struct mount *m; - /* - * slave 'mnt' to a peer mount that has the - * same root dentry. If none is available then - * slave it to anything that is available. 
- */ - for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { - if (m->mnt.mnt_root == mnt->mnt.mnt_root) { - master = m; - break; - } + for (m = next_peer(mnt); m != mnt; m = next_peer(m)) { + if (!will_be_unmounted(m)) + return m; } - list_del_init(&mnt->mnt_share); - mnt->mnt_group_id = 0; - CLEAR_MNT_SHARED(mnt); + mnt = mnt->mnt_master; + } while (mnt && will_be_unmounted(mnt)); + return mnt; +} + +static void transfer_propagation(struct mount *mnt, struct mount *to) +{ + struct hlist_node *p = NULL, *n; + struct mount *m; + + hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) { + m->mnt_master = to; + if (!to) + hlist_del_init(&m->mnt_slave); + else + p = &m->mnt_slave; } - list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) - slave_mnt->mnt_master = master; - list_move(&mnt->mnt_slave, &master->mnt_slave_list); - list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); - INIT_LIST_HEAD(&mnt->mnt_slave_list); - mnt->mnt_master = master; - return 0; + if (p) + hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list); } /* - * vfsmount lock must be held for write + * EXCL[namespace_sem] */ void change_mnt_propagation(struct mount *mnt, int type) { + struct mount *m = mnt->mnt_master; + if (type == MS_SHARED) { set_mnt_shared(mnt); return; } - do_make_slave(mnt); - if (type != MS_SLAVE) { - list_del_init(&mnt->mnt_slave); + if (IS_MNT_SHARED(mnt)) { + m = propagation_source(mnt); + if (list_empty(&mnt->mnt_share)) { + mnt_release_group_id(mnt); + } else { + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + } + CLEAR_MNT_SHARED(mnt); + transfer_propagation(mnt, m); + } + hlist_del_init(&mnt->mnt_slave); + if (type == MS_SLAVE) { + mnt->mnt_master = m; + if (m) + hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list); + } else { mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) - mnt->mnt.mnt_flags |= MNT_UNBINDABLE; + mnt->mnt_t_flags |= T_UNBINDABLE; else - mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; + mnt->mnt_t_flags &= ~T_UNBINDABLE; + } +} + +static struct mount *__propagation_next(struct mount *m, + struct mount *origin) +{ + while (1) { + struct mount *master = m->mnt_master; + + if (master == origin->mnt_master) { + struct mount *next = next_peer(m); + return (next == origin) ? NULL : next; + } else if (m->mnt_slave.next) + return next_slave(m); + + /* back at master */ + m = master; } } @@ -150,34 +166,24 @@ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); - while (1) { - struct mount *master = m->mnt_master; - - if (master == origin->mnt_master) { - struct mount *next = next_peer(m); - return (next == origin) ? NULL : next; - } else if (m->mnt_slave.next != &master->mnt_slave_list) - return next_slave(m); - - /* back at master */ - m = master; - } + return __propagation_next(m, origin); } static struct mount *skip_propagation_subtree(struct mount *m, struct mount *origin) { /* - * Advance m such that propagation_next will not return - * the slaves of m. + * Advance m past everything that gets propagation from it. 
*/ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) - m = last_slave(m); + struct mount *p = __propagation_next(m, origin); + + while (p && peers(m, p)) + p = __propagation_next(p, origin); - return m; + return p; } static struct mount *next_group(struct mount *m, struct mount *origin) @@ -185,7 +191,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) while (1) { while (1) { struct mount *next; - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { @@ -198,7 +204,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) /* m is the last peer */ while (1) { struct mount *master = m->mnt_master; - if (m->mnt_slave.next != &master->mnt_slave_list) + if (m->mnt_slave.next) return next_slave(m); m = next_peer(master); if (master->mnt_group_id == origin->mnt_group_id) @@ -212,142 +218,113 @@ static struct mount *next_group(struct mount *m, struct mount *origin) } } -/* all accesses are serialized by namespace_sem */ -static struct mount *last_dest, *first_source, *last_source, *dest_master; -static struct hlist_head *list; - -static inline bool peers(const struct mount *m1, const struct mount *m2) -{ - return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; -} - -static int propagate_one(struct mount *m, struct mountpoint *dest_mp) +static bool need_secondary(struct mount *m, struct mountpoint *dest_mp) { - struct mount *child; - int type; /* skip ones added by this propagate_mnt() */ if (IS_MNT_NEW(m)) - return 0; + return false; /* skip if mountpoint isn't visible in m */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) - return 0; + return false; /* skip if m is in the anon_ns */ if (is_anon_ns(m->mnt_ns)) - return 0; + return false; + return true; +} - if (peers(m, last_dest)) { - type = CL_MAKE_SHARED; - } else { - struct mount *n, *p; - bool done; - for (n = m; ; n = p) { - p = n->mnt_master; - if (p == dest_master || IS_MNT_MARKED(p)) - break; +static struct mount *find_master(struct mount *m, + struct mount *last_copy, + struct mount *original) +{ + struct mount *p; + + // ascend until there's a copy for something with the same master + for (;;) { + p = m->mnt_master; + if (!p || IS_MNT_MARKED(p)) + break; + m = p; + } + while (!peers(last_copy, original)) { + struct mount *parent = last_copy->mnt_parent; + if (parent->mnt_master == p) { + if (!peers(parent, m)) + last_copy = last_copy->mnt_master; + break; } - do { - struct mount *parent = last_source->mnt_parent; - if (peers(last_source, first_source)) - break; - done = parent->mnt_master == p; - if (done && peers(n, parent)) - break; - last_source = last_source->mnt_master; - } while (!done); - - type = CL_SLAVE; - /* beginning of peer group among the slaves? */ - if (IS_MNT_SHARED(m)) - type |= CL_MAKE_SHARED; + last_copy = last_copy->mnt_master; } - - child = copy_tree(last_source, last_source->mnt.mnt_root, type); - if (IS_ERR(child)) - return PTR_ERR(child); - read_seqlock_excl(&mount_lock); - mnt_set_mountpoint(m, dest_mp, child); - if (m->mnt_master != dest_master) - SET_MNT_MARK(m->mnt_master); - read_sequnlock_excl(&mount_lock); - last_dest = m; - last_source = child; - hlist_add_head(&child->mnt_hash, list); - return count_mounts(m->mnt_ns, child); + return last_copy; } -/* - * mount 'source_mnt' under the destination 'dest_mnt' at - * dentry 'dest_dentry'. 
And propagate that mount to - * all the peer and slave mounts of 'dest_mnt'. - * Link all the new mounts into a propagation tree headed at - * source_mnt. Also link all the new mounts using ->mnt_list - * headed at source_mnt's ->mnt_list +/** + * propagate_mnt() - create secondary copies for tree attachment + * @dest_mnt: destination mount. + * @dest_mp: destination mountpoint. + * @source_mnt: source mount. + * @tree_list: list of secondaries to be attached. * - * @dest_mnt: destination mount. - * @dest_dentry: destination dentry. - * @source_mnt: source mount. - * @tree_list : list of heads of trees to be attached. + * Create secondary copies for attaching a tree with root @source_mnt + * at mount @dest_mnt with mountpoint @dest_mp. Link all new mounts + * into a propagation graph. Set mountpoints for all secondaries, + * link their roots into @tree_list via ->mnt_hash. */ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, - struct mount *source_mnt, struct hlist_head *tree_list) + struct mount *source_mnt, struct hlist_head *tree_list) { - struct mount *m, *n; - int ret = 0; - - /* - * we don't want to bother passing tons of arguments to - * propagate_one(); everything is serialized by namespace_sem, - * so globals will do just fine. - */ - last_dest = dest_mnt; - first_source = source_mnt; - last_source = source_mnt; - list = tree_list; - dest_master = dest_mnt->mnt_master; - - /* all peers of dest_mnt, except dest_mnt itself */ - for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { - ret = propagate_one(n, dest_mp); - if (ret) - goto out; - } - - /* all slave groups */ - for (m = next_group(dest_mnt, dest_mnt); m; - m = next_group(m, dest_mnt)) { - /* everything in that slave group */ - n = m; + struct mount *m, *n, *copy, *this; + int err = 0, type; + + if (dest_mnt->mnt_master) + SET_MNT_MARK(dest_mnt->mnt_master); + + /* iterate over peer groups, depth first */ + for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) { + if (m == dest_mnt) { // have one for dest_mnt itself + copy = source_mnt; + type = CL_MAKE_SHARED; + n = next_peer(m); + if (n == m) + continue; + } else { + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; + n = m; + } do { - ret = propagate_one(n, dest_mp); - if (ret) - goto out; - n = next_peer(n); - } while (n != m); + if (!need_secondary(n, dest_mp)) + continue; + if (type & CL_SLAVE) // first in this peer group + copy = find_master(n, copy, source_mnt); + this = copy_tree(copy, copy->mnt.mnt_root, type); + if (IS_ERR(this)) { + err = PTR_ERR(this); + break; + } + read_seqlock_excl(&mount_lock); + mnt_set_mountpoint(n, dest_mp, this); + read_sequnlock_excl(&mount_lock); + if (n->mnt_master) + SET_MNT_MARK(n->mnt_master); + copy = this; + hlist_add_head(&this->mnt_hash, tree_list); + err = count_mounts(n->mnt_ns, this); + if (err) + break; + type = CL_MAKE_SHARED; + } while ((n = next_peer(n)) != m); } -out: - read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent; - if (m->mnt_master != dest_mnt->mnt_master) + if (m->mnt_master) CLEAR_MNT_MARK(m->mnt_master); } - read_sequnlock_excl(&mount_lock); - return ret; -} - -static struct mount *find_topper(struct mount *mnt) -{ - /* If there is exactly one mount covering mnt completely return it. 
*/ - struct mount *child; - - if (!list_is_singular(&mnt->mnt_mounts)) - return NULL; - - child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); - if (child->mnt_mountpoint != mnt->mnt.mnt_root) - return NULL; - - return child; + if (dest_mnt->mnt_master) + CLEAR_MNT_MARK(dest_mnt->mnt_master); + return err; } /* @@ -407,12 +384,8 @@ bool propagation_would_overmount(const struct mount *from, */ int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; - if (mnt == parent) - return do_refcount_check(mnt, refcnt); - /* * quickly check if the current mount can be unmounted. * If not, we don't have to go checking for all other @@ -421,23 +394,27 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) return 1; - for (m = propagation_next(parent, parent); m; + if (mnt == parent) + return 0; + + for (struct mount *m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - int count = 1; - child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); - if (!child) - continue; + struct list_head *head; + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); - /* Is there exactly one mount on the child that covers - * it completely whose reference should be ignored? - */ - topper = find_topper(child); - if (topper) - count += 1; - else if (!list_empty(&child->mnt_mounts)) + if (!child) continue; - if (do_refcount_check(child, count)) + head = &child->mnt_mounts; + if (!list_empty(head)) { + /* + * a mount that covers child completely wouldn't prevent + * it being pulled out; any other would. + */ + if (!list_is_singular(head) || !child->overmount) + continue; + } + if (do_refcount_check(child, 1)) return 1; } return 0; @@ -463,181 +440,209 @@ void propagate_mount_unlock(struct mount *mnt) } } -static void umount_one(struct mount *mnt, struct list_head *to_umount) +static inline bool is_candidate(struct mount *m) { - CLEAR_MNT_MARK(mnt); - mnt->mnt.mnt_flags |= MNT_UMOUNT; - list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_umounting); - move_from_ns(mnt, to_umount); + return m->mnt_t_flags & T_UMOUNT_CANDIDATE; } -/* - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its - * parent propagates to. - */ -static bool __propagate_umount(struct mount *mnt, - struct list_head *to_umount, - struct list_head *to_restore) +static void umount_one(struct mount *m, struct list_head *to_umount) { - bool progress = false; - struct mount *child; - - /* - * The state of the parent won't change if this mount is - * already unmounted or marked as without children. - */ - if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) - goto out; + m->mnt.mnt_flags |= MNT_UMOUNT; + list_del_init(&m->mnt_child); + move_from_ns(m); + list_add_tail(&m->mnt_list, to_umount); +} - /* Verify topper is the only grandchild that has not been - * speculatively unmounted. 
- */ - list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { - if (child->mnt_mountpoint == mnt->mnt.mnt_root) - continue; - if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) - continue; - /* Found a mounted child */ - goto children; - } +static void remove_from_candidate_list(struct mount *m) +{ + m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE); + list_del_init(&m->mnt_list); +} - /* Mark mounts that can be unmounted if not locked */ - SET_MNT_MARK(mnt); - progress = true; +static void gather_candidates(struct list_head *set, + struct list_head *candidates) +{ + struct mount *m, *p, *q; - /* If a mount is without children and not locked umount it. */ - if (!IS_MNT_LOCKED(mnt)) { - umount_one(mnt, to_umount); - } else { -children: - list_move_tail(&mnt->mnt_umounting, to_restore); + list_for_each_entry(m, set, mnt_list) { + if (is_candidate(m)) + continue; + m->mnt_t_flags |= T_UMOUNT_CANDIDATE; + p = m->mnt_parent; + q = propagation_next(p, p); + while (q) { + struct mount *child = __lookup_mnt(&q->mnt, + m->mnt_mountpoint); + if (child) { + /* + * We might've already run into this one. That + * must've happened on earlier iteration of the + * outer loop; in that case we can skip those + * parents that get propagation from q - there + * will be nothing new on those as well. + */ + if (is_candidate(child)) { + q = skip_propagation_subtree(q, p); + continue; + } + child->mnt_t_flags |= T_UMOUNT_CANDIDATE; + if (!will_be_unmounted(child)) + list_add(&child->mnt_list, candidates); + } + q = propagation_next(q, p); + } } -out: - return progress; + list_for_each_entry(m, set, mnt_list) + m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } -static void umount_list(struct list_head *to_umount, - struct list_head *to_restore) +/* + * We know that some child of @m can't be unmounted. In all places where the + * chain of descent of @m has child not overmounting the root of parent, + * the parent can't be unmounted either. + */ +static void trim_ancestors(struct mount *m) { - struct mount *mnt, *child, *tmp; - list_for_each_entry(mnt, to_umount, mnt_list) { - list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { - /* topper? */ - if (child->mnt_mountpoint == mnt->mnt.mnt_root) - list_move_tail(&child->mnt_umounting, to_restore); - else - umount_one(child, to_umount); - } + struct mount *p; + + for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) { + if (IS_MNT_MARKED(m)) // all candidates beneath are overmounts + return; + SET_MNT_MARK(m); + if (m != p->overmount) + p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } } -static void restore_mounts(struct list_head *to_restore) +/* + * Find and exclude all umount candidates forbidden by @m + * (see Documentation/filesystems/propagate_umount.txt) + * If we can immediately tell that @m is OK to unmount (unlocked + * and all children are already committed to unmounting) commit + * to unmounting it. + * Only @m itself might be taken from the candidates list; + * anything found by trim_ancestors() is marked non-candidate + * and left on the list. + */ +static void trim_one(struct mount *m, struct list_head *to_umount) { - /* Restore mounts to a clean working state */ - while (!list_empty(to_restore)) { - struct mount *mnt, *parent; - struct mountpoint *mp; - - mnt = list_first_entry(to_restore, struct mount, mnt_umounting); - CLEAR_MNT_MARK(mnt); - list_del_init(&mnt->mnt_umounting); - - /* Should this mount be reparented? 
*/ - mp = mnt->mnt_mp; - parent = mnt->mnt_parent; - while (parent->mnt.mnt_flags & MNT_UMOUNT) { - mp = parent->mnt_mp; - parent = parent->mnt_parent; - } - if (parent != mnt->mnt_parent) { - mnt_change_mountpoint(parent, mp, mnt); - mnt_notify_add(mnt); + bool remove_this = false, found = false, umount_this = false; + struct mount *n; + + if (!is_candidate(m)) { // trim_ancestors() left it on list + remove_from_candidate_list(m); + return; + } + + list_for_each_entry(n, &m->mnt_mounts, mnt_child) { + if (!is_candidate(n)) { + found = true; + if (n != m->overmount) { + remove_this = true; + break; + } } } + if (found) { + trim_ancestors(m); + } else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) { + remove_this = true; + umount_this = true; + } + if (remove_this) { + remove_from_candidate_list(m); + if (umount_this) + umount_one(m, to_umount); + } } -static void cleanup_umount_visitations(struct list_head *visited) +static void handle_locked(struct mount *m, struct list_head *to_umount) { - while (!list_empty(visited)) { - struct mount *mnt = - list_first_entry(visited, struct mount, mnt_umounting); - list_del_init(&mnt->mnt_umounting); + struct mount *cutoff = m, *p; + + if (!is_candidate(m)) { // trim_ancestors() left it on list + remove_from_candidate_list(m); + return; + } + for (p = m; is_candidate(p); p = p->mnt_parent) { + remove_from_candidate_list(p); + if (!IS_MNT_LOCKED(p)) + cutoff = p->mnt_parent; + } + if (will_be_unmounted(p)) + cutoff = p; + while (m != cutoff) { + umount_one(m, to_umount); + m = m->mnt_parent; } } /* - * collect all mounts that receive propagation from the mount in @list, - * and return these additional mounts in the same list. - * @list: the list of mounts to be unmounted. + * @m is not to going away, and it overmounts the top of a stack of mounts + * that are going away. We know that all of those are fully overmounted + * by the one above (@m being the topmost of the chain), so @m can be slid + * in place where the bottom of the stack is attached. * - * vfsmount lock must be held for write + * NOTE: here we temporarily violate a constraint - two mounts end up with + * the same parent and mountpoint; that will be remedied as soon as we + * return from propagate_umount() - its caller (umount_tree()) will detach + * the stack from the parent it (and now @m) is attached to. umount_tree() + * might choose to keep unmounted pieces stuck to each other, but it always + * detaches them from the mounts that remain in the tree. */ -int propagate_umount(struct list_head *list) +static void reparent(struct mount *m) { - struct mount *mnt; - LIST_HEAD(to_restore); - LIST_HEAD(to_umount); - LIST_HEAD(visited); - - /* Find candidates for unmounting */ - list_for_each_entry_reverse(mnt, list, mnt_list) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; + struct mount *p = m; + struct mountpoint *mp; - /* - * If this mount has already been visited it is known that it's - * entire peer group and all of their slaves in the propagation - * tree for the mountpoint has already been visited and there is - * no need to visit them again. 
- */ - if (!list_empty(&mnt->mnt_umounting)) - continue; + do { + mp = p->mnt_mp; + p = p->mnt_parent; + } while (will_be_unmounted(p)); - list_add_tail(&mnt->mnt_umounting, &visited); - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - if (!child) - continue; + mnt_change_mountpoint(p, mp, m); + mnt_notify_add(m); +} - if (!list_empty(&child->mnt_umounting)) { - /* - * If the child has already been visited it is - * know that it's entire peer group and all of - * their slaves in the propgation tree for the - * mountpoint has already been visited and there - * is no need to visit this subtree again. - */ - m = skip_propagation_subtree(m, parent); - continue; - } else if (child->mnt.mnt_flags & MNT_UMOUNT) { - /* - * We have come across a partially unmounted - * mount in a list that has not been visited - * yet. Remember it has been visited and - * continue about our merry way. - */ - list_add_tail(&child->mnt_umounting, &visited); - continue; - } +/** + * propagate_umount - apply propagation rules to the set of mounts for umount() + * @set: the list of mounts to be unmounted. + * + * Collect all mounts that receive propagation from the mount in @set and have + * no obstacles to being unmounted. Add these additional mounts to the set. + * + * See Documentation/filesystems/propagate_umount.txt if you do anything in + * this area. + * + * Locks held: + * mount_lock (write_seqlock), namespace_sem (exclusive). + */ +void propagate_umount(struct list_head *set) +{ + struct mount *m, *p; + LIST_HEAD(to_umount); // committed to unmounting + LIST_HEAD(candidates); // undecided umount candidates - /* Check the child and parents while progress is made */ - while (__propagate_umount(child, - &to_umount, &to_restore)) { - /* Is the parent a umount candidate? */ - child = child->mnt_parent; - if (list_empty(&child->mnt_umounting)) - break; - } - } + // collect all candidates + gather_candidates(set, &candidates); + + // reduce the set until it's non-shifting + list_for_each_entry_safe(m, p, &candidates, mnt_list) + trim_one(m, &to_umount); + + // ... 
and non-revealing + while (!list_empty(&candidates)) { + m = list_first_entry(&candidates,struct mount, mnt_list); + handle_locked(m, &to_umount); } - umount_list(&to_umount, &to_restore); - restore_mounts(&to_restore); - cleanup_umount_visitations(&visited); - list_splice_tail(&to_umount, list); + // now to_umount consists of all acceptable candidates + // deal with reparenting of remaining overmounts on those + list_for_each_entry(m, &to_umount, mnt_list) { + if (m->overmount) + reparent(m->overmount); + } - return 0; + // and fold them into the set + list_splice_tail_init(&to_umount, set); } diff --git a/fs/pnode.h b/fs/pnode.h index 2d026fb98b182..00ab153e3e9d3 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -10,14 +10,14 @@ #include <linux/list.h> #include "mount.h" -#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) +#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) #define IS_MNT_NEW(m) (!(m)->mnt_ns) -#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) -#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) -#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) -#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) -#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) +#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED) +#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED) #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) #define CL_EXPIRE 0x01 @@ -25,19 +25,26 @@ #define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 -#define CL_SHARED_TO_SLAVE 0x20 #define CL_COPY_MNT_NS_FILE 0x40 +/* + * EXCL[namespace_sem] + */ static inline void set_mnt_shared(struct mount *mnt) { - mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; - mnt->mnt.mnt_flags |= MNT_SHARED; + mnt->mnt_t_flags &= ~T_SHARED_MASK; + mnt->mnt_t_flags |= T_SHARED; +} + +static inline bool peers(const struct mount *m1, const struct mount *m2) +{ + return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); -int propagate_umount(struct list_head *); +void propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); diff --git a/include/linux/mount.h b/include/linux/mount.h index 1a508beba4460..5f9c053b08971 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -35,9 +35,6 @@ enum mount_flags { MNT_SHRINKABLE = 0x100, MNT_WRITE_HOLD = 0x200, - MNT_SHARED = 0x1000, /* if the vfsmount is a shared mount */ - MNT_UNBINDABLE = 0x2000, /* if the vfsmount is a unbindable mount */ - MNT_INTERNAL = 0x4000, MNT_LOCK_ATIME = 0x040000, @@ -48,25 +45,15 @@ enum mount_flags { MNT_LOCKED = 0x800000, MNT_DOOMED = 0x1000000, MNT_SYNC_UMOUNT = 0x2000000, - MNT_MARKED = 0x4000000, MNT_UMOUNT = 0x8000000, - /* - * MNT_SHARED_MASK is the set of flags that should be cleared when a - * mount becomes shared. Currently, this is only the flag that says a - * mount cannot be bind mounted, since this is how we create a mount - * that shares events with another mount. If you add a new MNT_* - * flag, consider how it interacts with shared mounts. 
- */ - MNT_SHARED_MASK = MNT_UNBINDABLE, MNT_USER_SETTABLE_MASK = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME | MNT_READONLY | MNT_NOSYMFOLLOW, MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, - MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | - MNT_LOCKED, + MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | + MNT_SYNC_UMOUNT | MNT_LOCKED }; struct vfsmount { @@ -98,6 +85,7 @@ int mnt_get_write_access(struct vfsmount *mnt); void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); +extern struct vfsmount *fc_mount_longterm(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 482af449e00da..093551fe66a7e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -483,7 +483,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); - mnt = fc_mount(fc); + mnt = fc_mount_longterm(fc); put_fs_context(fc); return mnt; } |
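For readers tracking the mountpoint-pinning conversion that runs through the fs/namespace.c hunks above (do_lock_mount(), lock_mount(), do_loopback(), do_move_mount(), finish_automount(), pivot_root()), here is a minimal userspace sketch of the new calling convention. It is not kernel code: the stand-in types, paths and function bodies are invented for illustration only; the shape of the API change is what the patch shows — callers start from an empty, caller-owned struct pinned_mountpoint, check a plain int error instead of IS_ERR()/PTR_ERR(), and release with unpin_mountpoint()/unlock_mount() on the same object.

/*
 * Illustrative sketch only -- NOT kernel code.  The diff above converts
 * get_mountpoint()/lock_mount()/unlock_mount() from returning a
 * struct mountpoint * (or ERR_PTR()) to filling a caller-owned
 * struct pinned_mountpoint and returning 0 or a negative errno.
 * The types, bodies and the "/mnt/example" path below are stand-ins.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct mountpoint { const char *m_path; };            /* stand-in type */
struct pinned_mountpoint { struct mountpoint *mp; };  /* mirrors the new API shape */

/* stand-in for get_mountpoint(dentry, &pinned): 0 on success, -errno on failure */
static int get_mountpoint(const char *path, struct pinned_mountpoint *pinned)
{
	struct mountpoint *mp = malloc(sizeof(*mp));

	if (!mp)
		return -ENOMEM;
	mp->m_path = path;
	pinned->mp = mp;
	return 0;
}

/* stand-in for unpin_mountpoint(): drops whatever get_mountpoint() pinned */
static void unpin_mountpoint(struct pinned_mountpoint *pinned)
{
	free(pinned->mp);
	pinned->mp = NULL;
}

int main(void)
{
	/* callers now start from an empty object, as "struct pinned_mountpoint mp = {};" */
	struct pinned_mountpoint mp = { 0 };
	int err = get_mountpoint("/mnt/example", &mp);

	if (err) {                       /* old code: if (IS_ERR(mp)) err = PTR_ERR(mp); */
		fprintf(stderr, "get_mountpoint: %d\n", err);
		return 1;
	}
	printf("pinned mountpoint at %s\n", mp.mp->m_path);
	unpin_mountpoint(&mp);           /* old code: put_mountpoint(mp) */
	return 0;
}

The symmetric pairing of lock_mount(path, &mp) with unlock_mount(&mp) (and get_mountpoint(dentry, &mp) with unpin_mountpoint(&mp)) is what lets the error paths in the patch collapse to a plain "if (err) goto out;", as seen in the do_loopback() and pivot_root() hunks above.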