summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cgroup.c20
-rw-r--r--kernel/cgroup/rstat.c1
-rw-r--r--kernel/futex/core.c5
-rw-r--r--kernel/futex/futex.h11
-rw-r--r--kernel/futex/pi.c2
-rw-r--r--kernel/futex/waitwake.c4
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/sched/autogroup.c4
-rw-r--r--kernel/sched/core.c12
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/ext.c113
-rw-r--r--kernel/sched/ext.h4
-rw-r--r--kernel/sched/fair.c19
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/seccomp.c12
-rw-r--r--kernel/time/clocksource.c9
-rw-r--r--kernel/time/hrtimer.c125
-rw-r--r--kernel/time/timer_migration.c10
-rw-r--r--kernel/trace/ring_buffer.c28
-rw-r--r--kernel/trace/trace.c12
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/workqueue.c12
23 files changed, 296 insertions, 121 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d9061bd55436..afc665b7b1fe 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- set_bit(CGRP_KILL, &cgrp->flags);
+ cgrp->kill_seq++;
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp)
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
-
- spin_lock_irq(&css_set_lock);
- clear_bit(CGRP_KILL, &cgrp->flags);
- spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
@@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
+ if (kargs->cgrp)
+ kargs->kill_seq = kargs->cgrp->kill_seq;
+ else
+ kargs->kill_seq = cset->dfl_cgrp->kill_seq;
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
@@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned int cgrp_kill_seq = 0;
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
@@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
- if (kargs->cgrp)
+ if (kargs->cgrp) {
cgrp_flags = kargs->cgrp->flags;
- else
+ cgrp_kill_seq = kargs->cgrp->kill_seq;
+ } else {
cgrp_flags = cset->dfl_cgrp->flags;
+ cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
+ }
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
@@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child,
* child down right after we finished preparing it for
* userspace.
*/
- kill = test_bit(CGRP_KILL, &cgrp_flags);
+ kill = kargs->kill_seq != cgrp_kill_seq;
}
spin_unlock_irq(&css_set_lock);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 5877974ece92..aac91466279f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -590,7 +590,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
- cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
#ifdef CONFIG_SCHED_CORE
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index ebdd76b4ecbb..3db8567f5a44 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -532,7 +532,8 @@ void futex_q_unlock(struct futex_hash_bucket *hb)
futex_hb_waiters_dec(hb);
}
-void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
{
int prio;
@@ -548,7 +549,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
- q->task = current;
+ q->task = task;
}
/**
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 99b32e728c4a..6b2f4c7eb720 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -285,13 +285,15 @@ static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
}
extern void __futex_unqueue(struct futex_q *q);
-extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task);
extern int futex_unqueue(struct futex_q *q);
/**
* futex_queue() - Enqueue the futex_q on the futex_hash_bucket
* @q: The futex_q to enqueue
* @hb: The destination hash bucket
+ * @task: Task queueing this futex
*
* The hb->lock must be held by the caller, and is released here. A call to
* futex_queue() is typically paired with exactly one call to futex_unqueue(). The
@@ -299,11 +301,14 @@ extern int futex_unqueue(struct futex_q *q);
* or nothing if the unqueue is done as part of the wake process and the unqueue
* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
* an example).
+ *
+ * Note that @task may be NULL, for async usage of futexes.
*/
-static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
__releases(&hb->lock)
{
- __futex_queue(q, hb);
+ __futex_queue(q, hb, task);
spin_unlock(&hb->lock);
}
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index daea650b16f5..7a941845f7ee 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -982,7 +982,7 @@ retry_private:
/*
* Only actually queue now that the atomic ops are done:
*/
- __futex_queue(&q, hb);
+ __futex_queue(&q, hb, current);
if (trylock) {
ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index eb86a7ade06a..25877d4f2f8f 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -349,7 +349,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* access to the hash list and forcing another memory barrier.
*/
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- futex_queue(q, hb);
+ futex_queue(q, hb, current);
/* Arm the timer */
if (timeout)
@@ -460,7 +460,7 @@ retry:
* next futex. Queue each futex at this moment so hb can
* be unlocked.
*/
- futex_queue(q, hb);
+ futex_queue(q, hb, current);
continue;
}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5432418c0fea..875f25ed6f71 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -31,10 +31,6 @@ config GENERIC_IRQ_EFFECTIVE_AFF_MASK
config GENERIC_PENDING_IRQ
bool
-# Deduce delayed migration from top-level interrupt chip flags
-config GENERIC_PENDING_IRQ_CHIPFLAGS
- bool
-
# Support for generic irq migrating off cpu before the cpu is offline.
config GENERIC_IRQ_MIGRATION
bool
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4005b13ebd7f..5dc5b0d7238e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -859,7 +859,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
unsigned long flags;
- int ret;
+ int ret = 0;
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
WARN_ON(1);
@@ -892,7 +892,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
out:
free_cpumask_var(affinity);
- return 0;
+ return ret;
}
/*
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 83d46b9b8ec8..2b331822c7e7 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p)
* see this thread after that: we can no longer use signal->autogroup.
* See the PF_EXITING check in task_wants_autogroup().
*/
- sched_move_task(p);
+ sched_move_task(p, true);
}
static void
@@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
* sched_autogroup_exit_task().
*/
for_each_thread(p, t)
- sched_move_task(t);
+ sched_move_task(t, true);
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 165c90ba64ea..9aecd914ac69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1063,9 +1063,10 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- /* Task can safely be re-inserted now: */
node = node->next;
- task->wake_q.next = NULL;
+ /* pairs with cmpxchg_relaxed() in __wake_q_add() */
+ WRITE_ONCE(task->wake_q.next, NULL);
+ /* Task can safely be re-inserted now. */
/*
* wake_up_process() executes a full barrier, which pairs with
@@ -9050,7 +9051,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
* its new group.
*/
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -9079,7 +9080,8 @@ void sched_move_task(struct task_struct *tsk)
put_prev_task(rq, tsk);
sched_change_group(tsk, group);
- scx_move_task(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -9180,7 +9182,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(task, css, tset)
- sched_move_task(task);
+ sched_move_task(task, false);
scx_cgroup_finish_attach();
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index fd7e85220715..ef047add7f9e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1262,6 +1262,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
+ } else if (fair_policy(p->policy)) {
+ P(se.slice);
}
#ifdef CONFIG_SCHED_CLASS_EXT
__PS("ext.enabled", task_on_scx(p));
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8857c0709bdd..5a81d9a1e31f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -123,6 +123,19 @@ enum scx_ops_flags {
SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
/*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
* CPU cgroup support flags
*/
SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -416,7 +430,7 @@ struct sched_ext_ops {
/**
* @update_idle: Update the idle state of a CPU
- * @cpu: CPU to udpate the idle state for
+ * @cpu: CPU to update the idle state for
* @idle: whether entering or exiting the idle state
*
* This operation is called when @rq's CPU goes or leaves the idle
@@ -882,6 +896,7 @@ static bool scx_warned_zero_slice;
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
@@ -1214,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
* @cur: current position, %NULL to start iteration
* @rev: walk backwards
*
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
unlikely(p->flags & PF_EXITING))
goto local;
+ /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ is_migration_disabled(p))
+ goto local;
+
if (!SCX_HAS_OP(enqueue))
goto global;
@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
/*
* list_add_tail() must be used. scx_ops_bypass() depends on tasks being
- * appened to the runnable_list.
+ * appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* - The BPF scheduler is bypassed while the rq is offline and we can always say
* no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
*/
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
bool trigger_error)
{
int cpu = cpu_of(rq);
+ SCHED_WARN_ON(task_cpu(p) == cpu);
+
+ /*
+ * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+ * the pinned CPU in migrate_disable_switch() while @p is being switched
+ * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+ * updated and thus another CPU may see @p on a DSQ inbetween leading to
+ * @p passing the below task_allowed_on_cpu() check while migration is
+ * disabled.
+ *
+ * Test the migration disabled state first as the race window is narrow
+ * and the BPF scheduler failing to check migration disabled state can
+ * easily be masked if task_allowed_on_cpu() is done first.
+ */
+ if (unlikely(is_migration_disabled(p))) {
+ if (trigger_error)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
+ return false;
+ }
+
/*
* We don't require the BPF scheduler to avoid dispatching to offline
* CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (trigger_error)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
- cpu_of(rq), p->comm, p->pid);
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
- if (unlikely(is_migration_disabled(p)))
- return false;
-
if (!scx_rq_online(rq))
return false;
@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
- if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
/*
* A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
* banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+ struct rq *locked_rq = rq;
+#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
#ifdef CONFIG_SMP
- if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dispatch_enqueue(find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
/* switch to @src_rq lock */
- if (rq != src_rq) {
- raw_spin_rq_unlock(rq);
+ if (locked_rq != src_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = src_rq;
raw_spin_rq_lock(src_rq);
}
@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
+ /* task has been moved to dst_rq, which is now locked */
+ locked_rq = dst_rq;
}
/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
/* switch back to @rq lock */
- if (rq != dst_rq) {
- raw_spin_rq_unlock(dst_rq);
+ if (locked_rq != rq) {
+ raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
#else /* CONFIG_SMP */
@@ -3144,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*
* Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
* to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
* behavior.
*
* When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3851,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP(SCX_KF_REST, tick, curr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
}
if (!curr->scx.slice)
@@ -3998,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
if (SCX_HAS_OP(disable))
- SCX_CALL_OP(SCX_KF_REST, disable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -4027,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+ SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -4323,25 +4371,12 @@ err:
return ops_sanitize_err("cgroup_prep_move", ret);
}
-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
{
if (!scx_cgroup_enabled)
return;
/*
- * We're called from sched_move_task() which handles both cgroup and
- * autogroup moves. Ignore the latter.
- *
- * Also ignore exiting tasks, because in the exit path tasks transition
- * from the autogroup to the root group, so task_group_is_autogroup()
- * alone isn't able to catch exiting autogroup tasks. This is safe for
- * cgroup_move(), because cgroup migrations never happen for PF_EXITING
- * tasks.
- */
- if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
- return;
-
- /*
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
@@ -4590,7 +4625,7 @@ static int scx_cgroup_init(void)
cgroup_warned_missing_idle = false;
/*
- * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
* cgroups and init, all online cgroups are initialized.
*/
rcu_read_lock();
@@ -5059,6 +5094,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
static_branch_disable(&scx_has_op[i]);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
+ static_branch_disable(&scx_ops_enq_migration_disabled);
static_branch_disable(&scx_ops_cpu_preempt);
static_branch_disable(&scx_builtin_idle_enabled);
synchronize_rcu();
@@ -5277,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
ops_state >> SCX_OPSS_QSEQ_SHIFT);
- dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
- p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
- p->scx.dsq_vtime, p->scx.slice);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+ p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
if (SCX_HAS_OP(dump_task)) {
@@ -5667,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
+ if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+ static_branch_enable(&scx_ops_enq_migration_disabled);
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 4d022d17ac7d..1079b56b0f7a 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
-void scx_move_task(struct task_struct *p);
+void scx_cgroup_move_task(struct task_struct *p);
void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
@@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle);
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
-static inline void scx_move_task(struct task_struct *p) {}
+static inline void scx_cgroup_move_task(struct task_struct *p) {}
static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce2e94ccad0c..1c0ef435a7aa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5385,6 +5385,15 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void set_delayed(struct sched_entity *se)
{
se->sched_delayed = 1;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since dequeue_entities()
+ * will account it for blocked tasks.
+ */
+ if (!entity_is_task(se))
+ return;
+
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -5397,6 +5406,16 @@ static void set_delayed(struct sched_entity *se)
static void clear_delayed(struct sched_entity *se)
{
se->sched_delayed = 0;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since a dequeue has
+ * already accounted for it or an enqueue of a task
+ * below it will account for it in enqueue_task_fair().
+ */
+ if (!entity_is_task(se))
+ return;
+
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 38e0e323dda2..b93c8c3dc05a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg,
extern void sched_destroy_group(struct task_group *tg);
extern void sched_release_group(struct task_group *tg);
-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, bool for_autogroup);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f59381c4a2ff..7bbb408431eb 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -749,6 +749,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
if (WARN_ON_ONCE(!fprog))
return false;
+ /* Our single exception to filtering. */
+#ifdef __NR_uretprobe
+#ifdef SECCOMP_ARCH_COMPAT
+ if (sd->arch == SECCOMP_ARCH_NATIVE)
+#endif
+ if (sd->nr == __NR_uretprobe)
+ return true;
+#endif
+
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
u16 code = insn->code;
@@ -1023,6 +1032,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
*/
static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+#ifdef __NR_uretprobe
+ __NR_uretprobe,
+#endif
-1, /* negative terminated */
};
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7304d7cf47f2..2a7802ec480c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -373,16 +373,18 @@ void clocksource_verify_percpu(struct clocksource *cs)
cpumask_clear(&cpus_ahead);
cpumask_clear(&cpus_behind);
cpus_read_lock();
- preempt_disable();
+ migrate_disable();
clocksource_verify_choose_cpus();
if (cpumask_empty(&cpus_chosen)) {
- preempt_enable();
+ migrate_enable();
cpus_read_unlock();
pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
return;
}
testcpu = smp_processor_id();
- pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
+ cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ preempt_disable();
for_each_cpu(cpu, &cpus_chosen) {
if (cpu == testcpu)
continue;
@@ -402,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
cs_nsec_min = cs_nsec;
}
preempt_enable();
+ migrate_enable();
cpus_read_unlock();
if (!cpumask_empty(&cpus_ahead))
pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f6d8df94045c..deb1aa32814e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -145,11 +156,6 @@ static struct hrtimer_cpu_base migration_cpu_base = {
#define migration_base migration_cpu_base.clock_base[0]
-static inline bool is_migration_base(struct hrtimer_clock_base *base)
-{
- return base == &migration_base;
-}
-
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
@@ -183,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will we issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -254,8 +287,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -264,8 +297,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -275,11 +307,6 @@ again:
#else /* CONFIG_SMP */
-static inline bool is_migration_base(struct hrtimer_clock_base *base)
-{
- return false;
-}
-
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
__acquires(&timer->base->cpu_base->lock)
@@ -716,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1205,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1216,10 +1242,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on a unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1248,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
@@ -1370,6 +1421,18 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
}
}
+#ifdef CONFIG_SMP
+static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return base == &migration_base;
+}
+#else
+static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return false;
+}
+#endif
+
/*
* This function is called on PREEMPT_RT kernels when the fast path
* deletion of a timer failed because the timer callback function was
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 9cb9b6584ea1..2f6330831f08 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1675,6 +1675,9 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
} while (i < tmigr_hierarchy_levels);
+ /* Assert single root */
+ WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top]));
+
while (i > 0) {
group = stack[--i];
@@ -1716,7 +1719,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
WARN_ON_ONCE(top == 0);
lvllist = &tmigr_level_list[top];
- if (group->num_children == 1 && list_is_singular(lvllist)) {
+
+ /*
+ * Newly created root level should have accounted the upcoming
+ * CPU's child group and pre-accounted the old root.
+ */
+ if (group->num_children == 2 && list_is_singular(lvllist)) {
/*
* The target CPU must never do the prepare work, except
* on early boot when the boot CPU is the target. Otherwise
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b8e0ae15ca5b..bb6089c2951e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
* must be the same.
*/
static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
- struct trace_buffer *buffer, int nr_pages)
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
struct buffer_data_page *subbuf;
@@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
unsigned long buffers_end;
int i;
+ if (!subbuf_mask)
+ return false;
+
/* Check the meta magic and meta struct size */
if (meta->magic != RING_BUFFER_META_MAGIC ||
meta->struct_size != sizeof(*meta)) {
@@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
subbuf = rb_subbufs_from_meta(meta);
+ bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
+
/* Is the meta buffers and the subbufs themselves have correct data? */
for (i = 0; i < meta->nr_subbufs; i++) {
if (meta->buffers[i] < 0 ||
@@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
return false;
}
+ if (test_bit(meta->buffers[i], subbuf_mask)) {
+ pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
+ return false;
+ }
+
+ set_bit(meta->buffers[i], subbuf_mask);
subbuf = (void *)subbuf + subbuf_size;
}
@@ -1838,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->cpu);
goto invalid;
}
+
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+
entries += ret;
entry_bytes += local_read(&head_page->page->commit);
local_set(&cpu_buffer->head_page->entries, ret);
@@ -1889,17 +1906,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
{
struct ring_buffer_meta *meta;
+ unsigned long *subbuf_mask;
unsigned long delta;
void *subbuf;
int cpu;
int i;
+ /* Create a mask to test the subbuf array */
+ subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
+ /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *next_meta;
meta = rb_range_meta(buffer, nr_pages, cpu);
- if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+ if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
/* Make the mappings match the current address */
subbuf = rb_subbufs_from_meta(meta);
delta = (unsigned long)subbuf - meta->first_buffer;
@@ -1943,6 +1965,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
subbuf += meta->subbuf_size;
}
}
+ bitmap_free(subbuf_mask);
}
static void *rbm_start(struct seq_file *m, loff_t *pos)
@@ -7126,6 +7149,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
kfree(cpu_buffer->subbuf_ids);
cpu_buffer->subbuf_ids = NULL;
rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
}
unlock:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1496a5ac33ae..0e6d517e74e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5977,8 +5977,6 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret;
-
guard(mutex)(&trace_types_lock);
if (cpu_id != RING_BUFFER_ALL_CPUS) {
@@ -5987,11 +5985,7 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
return -EINVAL;
}
- ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
- if (ret < 0)
- ret = -ENOMEM;
-
- return ret;
+ return __tracing_resize_ring_buffer(tr, size, cpu_id);
}
static void update_last_data(struct trace_array *tr)
@@ -8285,6 +8279,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
struct trace_iterator *iter = &info->iter;
int ret = 0;
+ /* Currently the boot mapped buffer is not supported for mmap */
+ if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
+ return -ENODEV;
+
ret = get_snapshot_map(iter->tr);
if (ret)
return ret;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 54d850997c0a..136c750b0b4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -198,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func)) {
- *task_var |= TRACE_GRAPH_NOTRACE_BIT;
+ *task_var |= TRACE_GRAPH_NOTRACE;
/*
* Need to return 1 to have the return called
* that will clear the NOTRACE bit.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3c2c45313c88..97152f2250fe 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3517,12 +3517,6 @@ repeat:
}
/*
- * Put the reference grabbed by send_mayday(). @pool won't
- * go away while we're still attached to it.
- */
- put_pwq(pwq);
-
- /*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
*/
@@ -3532,6 +3526,12 @@ repeat:
worker_detach_from_pool(rescuer);
+ /*
+ * Put the reference grabbed by send_mayday(). @pool might
+ * go away any time after it.
+ */
+ put_pwq_unlocked(pwq);
+
raw_spin_lock_irq(&wq_mayday_lock);
}