summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/core.c8
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--kernel/sched/cpufreq.c37
-rw-r--r--kernel/sched/deadline.c4
-rw-r--r--kernel/sched/fair.c26
-rw-r--r--kernel/sched/rt.c4
-rw-r--r--kernel/sched/sched.h48
8 files changed, 129 insertions, 5 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7d4cba227cbd..414d9c16da42 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
@@ -19,3 +23,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea8f49ae0062..44db0fffa8be 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
+#include <linux/frame.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -2689,7 +2690,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
-static inline struct rq *
+static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
@@ -3174,7 +3175,7 @@ static void __sched notrace __schedule(bool preempt)
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
- to_wakeup = wq_worker_sleeping(prev, cpu);
+ to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
@@ -3204,6 +3205,7 @@ static void __sched notrace __schedule(bool preempt)
balance_callback(rq);
}
+STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
static inline void sched_submit_work(struct task_struct *tsk)
{
@@ -8441,7 +8443,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_files,
- .early_init = 1,
+ .early_init = true,
};
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb55bbf2..2ddaebf7469a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
.legacy_cftypes = files,
- .early_init = 1,
+ .early_init = true,
};
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000000..928c4ba32f68
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,37 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ *
+ * Set and publish the update_util_data pointer for the given CPU. That pointer
+ * points to a struct update_util_data object containing a callback function
+ * to call from cpufreq_update_util(). That function will be called from an RCU
+ * read-side critical section, so it must not sleep.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
+{
+ if (WARN_ON(data && !data->func))
+ return;
+
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036facbe1..affd97ec9f65 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -717,6 +717,10 @@ static void update_curr_dl(struct rq *rq)
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
+ /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
/*
* Consumed budget is computed considering the time as
* observed by schedulable tasks (excluding time spent
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33130529e9b5..46d64e4ccfde 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2856,7 +2856,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
- int cpu = cpu_of(rq_of(cfs_rq));
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Track task load average for carrying it to new CPU after migrated, and
@@ -2868,6 +2869,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
update_tg_load_avg(cfs_rq, 0);
+
+ if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ unsigned long max = rq->cpu_capacity_orig;
+
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_clock(rq),
+ min(cfs_rq->avg.util_avg, max), max);
+ }
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 562471329487..c41ea7ac1764 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -953,6 +953,10 @@ static void update_curr_rt(struct rq *rq)
if (curr->sched_class != &rt_sched_class)
return;
+ /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2ff5a2bd6df..382848a24ed9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1793,3 +1793,51 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @time: Current time.
+ * @util: Current utilization.
+ * @max: Utilization ceiling.
+ *
+ * This function is called by the scheduler on every invocation of
+ * update_load_avg() on the CPU whose utilization is being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ */
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ if (data)
+ data->func(data, time, util, max);
+}
+
+/**
+ * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
+ * @time: Current time.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_trigger_update(u64 time)
+{
+ cpufreq_update_util(time, ULONG_MAX, 0);
+}
+#else
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
+static inline void cpufreq_trigger_update(u64 time) {}
+#endif /* CONFIG_CPU_FREQ */