summaryrefslogtreecommitdiff
path: root/kernel/cgroup
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/cgroup-v1.c2
-rw-r--r--kernel/cgroup/cgroup.c110
-rw-r--r--kernel/cgroup/cpuset.c3
-rw-r--r--kernel/cgroup/legacy_freezer.c23
-rw-r--r--kernel/cgroup/rstat.c48
5 files changed, 139 insertions, 47 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 2ade21b54dc4..ff6a8099eb2a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
+ cpus_read_lock();
percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
break;
}
percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
mutex_unlock(&cgroup_mutex);
return retval;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ffaccd6373f1..b6e3110b3ea7 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1820,6 +1820,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
if (ss->css_rstat_flush) {
list_del_rcu(&css->rstat_css_node);
+ synchronize_rcu();
list_add_rcu(&css->rstat_css_node,
&dcgrp->rstat_css_list);
}
@@ -2370,6 +2371,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(bool lock_threadgroup)
+{
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2841,8 +2883,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ bool *threadgroup_locked)
{
struct task_struct *tsk;
pid_t pid;
@@ -2859,12 +2900,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2895,17 +2932,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
goto out_unlock_rcu;
out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
- }
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2913,8 +2947,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -3000,8 +3034,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
- if (has_tasks)
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(has_tasks);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3022,8 +3055,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- if (has_tasks)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -3698,7 +3730,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
- new = psi_trigger_create(psi, buf, nbytes, res);
+ new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
@@ -4340,6 +4372,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4971,13 +5023,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -5003,7 +5055,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -6017,6 +6069,9 @@ struct cgroup *cgroup_get_from_id(u64 id)
if (!kn)
goto out;
+ if (kernfs_type(kn) != KERNFS_DIR)
+ goto put;
+
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
@@ -6024,7 +6079,7 @@ struct cgroup *cgroup_get_from_id(u64 id)
cgrp = NULL;
rcu_read_unlock();
-
+put:
kernfs_put(kn);
out:
return cgrp;
@@ -6132,11 +6187,6 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
return ERR_CAST(css);
cgrp = css->cgroup;
- if (!cgroup_on_dfl(cgrp)) {
- cgroup_put(cgrp);
- return ERR_PTR(-EBADF);
- }
-
return cgrp;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58aadfda9b8b..1f3a55297f39 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- cpus_read_lock();
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d173..1b6b21851e9d 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
}
mutex_unlock(&freezer_mutex);
@@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
freezer->state = 0;
@@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
__thaw_task(task);
} else {
freeze_task(task);
+
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec(&freezer_active);
unfreeze_cgroup(freezer);
}
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index feb59380c896..793ecff29038 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
return pos;
}
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -501,3 +531,21 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
+}
+late_initcall(bpf_rstat_kfunc_init);