/*
* Copyright (c) 2012-2018 Richard Braun.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* The scheduling algorithm implemented by this module, named Distributed
* Group Ratio Round-Robin (DGR3), is based on the following papers :
* - "Group Ratio Round-Robin: O(1) Proportional Share Scheduling for
* Uniprocessor and Multiprocessor Systems" by Bogdan Caprita, Wong Chun
* Chan, Jason Nieh, Clifford Stein and Haoqiang Zheng.
* - "Efficient and Scalable Multiprocessor Fair Scheduling Using Distributed
* Weighted Round-Robin" by Tong Li, Dan Baumberger and Scott Hahn.
*
* Note that the Group Ratio Round-Robin (GR3) paper offers a multiprocessor
* extension, but based on a single global queue, which strongly limits its
* scalability on systems with many processors. That extension isn't used in
* this implementation.
*
* The basic idea is to use GR3 for processor-local scheduling, and Distributed
* Weighted Round-Robin (DWRR) for inter-processor load balancing. These
* algorithms were chosen for several reasons. To begin with, they provide
* fair scheduling, a very desirable property for a modern scheduler. Next,
* being based on round-robin, their algorithmic complexity is very low (GR3
* has O(1) scheduling complexity, and O(g) complexity on thread addition
* or removal, g being the number of groups, with one group per priority, a
* low number in practice). Finally, they're very simple to implement, making
* them easy to adjust and maintain.
*
* Both algorithms are actually slightly modified for efficiency. First, this
* version of GR3 is simplified by mapping exactly one group to one priority,
* and in turn, one weight. This is possible because priorities are intended
* to match Unix nice values, and systems commonly provide a constant, small
* set of nice values. This removes the need for accounting deficit. Next,
* round tracking is used to improve the handling of dynamic events : work
* scaling is performed only on thread addition, and not when a thread that
* was removed is added again during the same round. In addition, since GR3
* is itself a round-robin algorithm, it already provides the feature required
* from local scheduling by DWRR, namely round slicing. Consequently, DWRR
* doesn't sit "on top" of GR3, but is actually merged with it. The result is
* an algorithm that shares the same data for both local scheduling and load
* balancing.
*
* A few terms are used by both papers with slightly different meanings. Here
* are the definitions used in this implementation :
* - The time unit is the system timer period (1 / tick frequency)
* - Work is the amount of execution time units consumed
* - Weight is the amount of execution time units allocated
* - A round is the shortest period during which all threads in a run queue
* consume their allocated time (i.e. their work reaches their weight)
*
* TODO Sub-tick accounting.
*
* TODO Setting affinity after thread creation.
*
* TODO Take into account the underlying CPU topology (and adjust load
* balancing to access the global highest round less frequently on large
* processor groups, perhaps by applying the load balancing algorithm in a
* bottom-up fashion with one highest round per processor group).
*
* TODO For now, interactivity can not be experimented. The current strategy
* is to always add threads in front of their group queue and track rounds
* so that they don't get more time than they should. A direct consequence
* is that continually spawning threads at short intervals is likely to cause
* starvation. This could be fixed by adding newly created threads at the back
* of their group queue. For now, don't overengineer, and wait until all this
* can actually be tested.
*
* TODO Review weight computation (it may be more appropriate to determine
* weights in a smoother way than a raw scaling).
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
* Preemption level of a suspended thread.
*
* The expected interrupt, preemption and run queue lock state when
* dispatching a thread is :
* - interrupts disabled
* - preemption disabled
* - run queue locked
*
* Locking the run queue increases the preemption level once more,
* making its value 2.
*/
#define THREAD_SUSPEND_PREEMPT_LEVEL 2
/*
* Scheduling classes.
*
* Classes are sorted by order of priority (lower indexes first). The same
* class can apply to several policies.
*
* The idle class is reserved for the per-CPU idle threads.
*/
#define THREAD_SCHED_CLASS_RT 0
#define THREAD_SCHED_CLASS_FS 1
#define THREAD_SCHED_CLASS_IDLE 2
#define THREAD_NR_SCHED_CLASSES 3
/*
* Global priority bases for each scheduling class.
*
* Global priorities are only used to determine which of two threads
* has the higher priority, and should only matter for priority
* inheritance.
*
* In the current configuration, all fair-scheduling threads have the
* same global priority.
*/
#define THREAD_SCHED_GLOBAL_PRIO_RT 2
#define THREAD_SCHED_GLOBAL_PRIO_FS 1
#define THREAD_SCHED_GLOBAL_PRIO_IDLE 0
/*
* Default time slice for real-time round-robin scheduling.
*/
#define THREAD_DEFAULT_RR_TIME_SLICE (CLOCK_FREQ / 10)
/*
* Maximum number of threads which can be pulled from a remote run queue
* while interrupts are disabled.
*/
#define THREAD_MAX_MIGRATIONS 16
/*
* Delay (in ticks) between two balance attempts when a run queue is idle.
*/
#define THREAD_IDLE_BALANCE_TICKS (CLOCK_FREQ / 2)
/*
* Run queue properties for real-time threads.
*/
struct thread_rt_runq {
    unsigned int bitmap;    /* one bit set per non-empty priority list below */
    /* One FIFO list of runnable threads per real-time priority */
    struct list threads[THREAD_SCHED_RT_PRIO_MAX + 1];
};
/*
* Initial value of the highest round.
*
* Set to a high value to make sure overflows are correctly handled.
*/
#define THREAD_FS_INITIAL_ROUND ((unsigned long)-10)
/*
* Round slice base unit for fair-scheduling threads.
*/
#define THREAD_FS_ROUND_SLICE_BASE (CLOCK_FREQ / 10)
/*
* Group of threads sharing the same weight.
*/
struct thread_fs_group {
    struct list node;       /* linkage in the run queue group list, ordered by weight */
    struct list threads;    /* threads belonging to this group */
    unsigned int weight;    /* sum of the weights of all member threads */
    unsigned int work;      /* execution time units consumed this round */
};
/*
* Run queue properties for fair-scheduling threads.
*
* The current group pointer has a valid address only when the run queue isn't
* empty.
*/
struct thread_fs_runq {
    /* One group per fair-scheduling priority (and therefore per weight) */
    struct thread_fs_group group_array[THREAD_SCHED_FS_PRIO_MAX + 1];
    struct list groups;             /* non-empty groups, sorted by weight */
    struct list threads;            /* all threads, used by the balancer */
    struct thread_fs_group *current;    /* group currently being served */
    unsigned int nr_threads;
    unsigned int weight;            /* sum of the weights of all groups */
    unsigned int work;              /* total work consumed this round */
};
/*
* Per processor run queue.
*
* Locking multiple run queues is done in the ascending order of their CPU
* identifier. Interrupts must be disabled whenever locking a run queue, even
* a remote one, otherwise an interrupt (which invokes the scheduler on its
* return path) may violate the locking order.
*/
struct thread_runq {
    alignas(CPU_L1_SIZE) struct spinlock lock;  /* own cache line to limit bouncing */
    unsigned int cpu;               /* owning processor, also the locking order key */
    unsigned int nr_threads;        /* all classes, excluding the idler */
    struct thread *current;         /* thread currently dispatched on this CPU */
    /* Real-time related members */
    struct thread_rt_runq rt_runq;
    /*
     * Fair-scheduling related members.
     *
     * The current round is set when the active run queue becomes non-empty.
     * It's not reset when both run queues become empty. As a result, the
     * current round has a meaningful value only when at least one thread is
     * present, i.e. the global weight isn't zero.
     */
    unsigned long fs_round;
    unsigned int fs_weight;
    /* Active/expired pair, swapped when a round ends */
    struct thread_fs_runq fs_runqs[2];
    struct thread_fs_runq *fs_runq_active;
    struct thread_fs_runq *fs_runq_expired;
    struct thread *balancer;        /* per-CPU load balancer thread */
    struct thread *idler;           /* per-CPU idle thread */
    /* Ticks before the next balancing attempt when a run queue is idle */
    unsigned int idle_balance_ticks;
    struct syscnt sc_schedule_intrs;
    struct syscnt sc_boosts;
};
/*
* Operations of a scheduling class.
*/
struct thread_sched_ops {
    /* Select (and lock) the run queue a newly runnable thread should join */
    struct thread_runq * (*select_runq)(struct thread *thread);
    void (*add)(struct thread_runq *runq, struct thread *thread);
    void (*remove)(struct thread_runq *runq, struct thread *thread);
    /* May be NULL (callers check before invoking) */
    void (*put_prev)(struct thread_runq *runq, struct thread *thread);
    struct thread * (*get_next)(struct thread_runq *runq);
    void (*reset_priority)(struct thread *thread, unsigned short priority);
    /* May be NULL */
    void (*update_priority)(struct thread *thread, unsigned short priority);
    unsigned int (*get_global_priority)(unsigned short priority);
    /* May be NULL */
    void (*set_next)(struct thread_runq *runq, struct thread *thread);
    void (*tick)(struct thread_runq *runq, struct thread *thread);
};
static struct thread_runq thread_runq __percpu;
/*
* Statically allocated fake threads that provide thread context to processors
* during bootstrap.
*/
static struct thread thread_booters[CONFIG_MAX_CPUS] __initdata;
static struct kmem_cache thread_cache;
#ifndef CONFIG_THREAD_STACK_GUARD
static struct kmem_cache thread_stack_cache;
#endif /* CONFIG_THREAD_STACK_GUARD */
static const unsigned char thread_policy_table[THREAD_NR_SCHED_POLICIES] = {
[THREAD_SCHED_POLICY_FIFO] = THREAD_SCHED_CLASS_RT,
[THREAD_SCHED_POLICY_RR] = THREAD_SCHED_CLASS_RT,
[THREAD_SCHED_POLICY_FS] = THREAD_SCHED_CLASS_FS,
[THREAD_SCHED_POLICY_IDLE] = THREAD_SCHED_CLASS_IDLE,
};
static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES];
/*
* Map of run queues for which a processor is running.
*/
static struct cpumap thread_active_runqs;
/*
* Map of idle run queues.
*
* Access to this map isn't synchronized. It is merely used as a fast hint
* to find run queues that are likely to be idle.
*/
static struct cpumap thread_idle_runqs;
/*
* System-wide value of the current highest round.
*
* This global variable is accessed without any synchronization. Its value
* being slightly inaccurate doesn't harm the fairness properties of the
* scheduling and load balancing algorithms.
*
* There can be moderate bouncing on this word so give it its own cache line.
*/
static struct {
alignas(CPU_L1_SIZE) volatile unsigned long value;
} thread_fs_highest_round_struct;
#define thread_fs_highest_round (thread_fs_highest_round_struct.value)
/*
* Number of TSD keys actually allocated.
*/
static unsigned int thread_nr_keys __read_mostly;
/*
* Destructors installed for each key.
*/
static thread_dtor_fn_t thread_dtors[THREAD_KEYS_MAX] __read_mostly;
/*
* Number of processors which have requested the scheduler to run.
*
* This value is used to implement a global barrier across the entire
* system at boot time, so that inter-processor requests may not be
* lost in case a processor is slower to initialize.
*/
static unsigned int thread_nr_boot_cpus __initdata;
/*
 * Deferred-reclamation request for a dead thread, processed by the
 * work queue machinery.
 */
struct thread_zombie {
    struct work work;
    struct thread *thread;
};
/*
 * Translate a scheduling policy into its scheduling class.
 */
static unsigned char
thread_policy_to_class(unsigned char policy)
{
    unsigned char sched_class;

    assert(policy < ARRAY_SIZE(thread_policy_table));
    sched_class = thread_policy_table[policy];
    return sched_class;
}
/*
 * Record the wait channel a thread is about to sleep on.
 */
static void
thread_set_wchan(struct thread *thread, const void *wchan_addr,
                 const char *wchan_desc)
{
    assert(wchan_addr != NULL);
    assert(wchan_desc != NULL);

    thread->wchan_addr = wchan_addr;
    thread->wchan_desc = wchan_desc;
}
/*
 * Reset the wait channel, marking the thread as no longer waiting.
 */
static void
thread_clear_wchan(struct thread *thread)
{
    thread->wchan_desc = NULL;
    thread->wchan_addr = NULL;
}
/*
 * Return the operations table of the given scheduling class.
 */
static const struct thread_sched_ops *
thread_get_sched_ops(unsigned char sched_class)
{
    const struct thread_sched_ops *ops;

    assert(sched_class < ARRAY_SIZE(thread_sched_ops));
    ops = &thread_sched_ops[sched_class];
    return ops;
}
/*
 * Operations for the user-requested (unboosted) scheduling class.
 */
static const struct thread_sched_ops *
thread_get_user_sched_ops(const struct thread *thread)
{
    unsigned char sched_class;

    sched_class = thread_user_sched_class(thread);
    return thread_get_sched_ops(sched_class);
}
/*
 * Operations for the effective (possibly boosted) scheduling class.
 */
static const struct thread_sched_ops *
thread_get_real_sched_ops(const struct thread *thread)
{
    unsigned char sched_class;

    sched_class = thread_real_sched_class(thread);
    return thread_get_sched_ops(sched_class);
}
/*
 * Initialize the real-time part of a run queue: empty bitmap, one
 * initialized list per priority.
 */
static void __init
thread_runq_init_rt(struct thread_runq *runq)
{
    struct thread_rt_runq *rt_runq = &runq->rt_runq;

    rt_runq->bitmap = 0;

    for (size_t n = 0; n < ARRAY_SIZE(rt_runq->threads); n++) {
        list_init(&rt_runq->threads[n]);
    }
}
/*
 * Reset a fair-scheduling group: no members, no weight, no work.
 */
static void __init
thread_fs_group_init(struct thread_fs_group *group)
{
    group->work = 0;
    group->weight = 0;
    list_init(&group->threads);
}
/*
 * Initialize a fair-scheduling run queue: all groups empty, counters zeroed.
 */
static void __init
thread_fs_runq_init(struct thread_fs_runq *fs_runq)
{
    for (size_t n = 0; n < ARRAY_SIZE(fs_runq->group_array); n++) {
        thread_fs_group_init(&fs_runq->group_array[n]);
    }

    list_init(&fs_runq->groups);
    list_init(&fs_runq->threads);
    fs_runq->work = 0;
    fs_runq->weight = 0;
    fs_runq->nr_threads = 0;
}
/*
 * Initialize the fair-scheduling part of a run queue, designating the
 * first internal queue as active and the second as expired.
 */
static void __init
thread_runq_init_fs(struct thread_runq *runq)
{
    runq->fs_weight = 0;
    runq->fs_runq_active = &runq->fs_runqs[0];
    runq->fs_runq_expired = &runq->fs_runqs[1];

    for (size_t n = 0; n < ARRAY_SIZE(runq->fs_runqs); n++) {
        thread_fs_runq_init(&runq->fs_runqs[n]);
    }
}
/*
 * Fully initialize a per-processor run queue.
 *
 * The booter thread provides a valid current thread during bootstrap,
 * before the idler exists.
 */
static void __init
thread_runq_init(struct thread_runq *runq, unsigned int cpu,
                 struct thread *booter)
{
    char name[SYSCNT_NAME_SIZE];

    spinlock_init(&runq->lock);
    runq->cpu = cpu;
    runq->nr_threads = 0;
    runq->current = booter;
    thread_runq_init_rt(runq);
    thread_runq_init_fs(runq);
    runq->balancer = NULL;
    runq->idler = NULL;
    /* -1 disables idle balancing until the balancer thread is installed */
    runq->idle_balance_ticks = (unsigned int)-1;
    snprintf(name, sizeof(name), "thread_schedule_intrs/%u", cpu);
    syscnt_register(&runq->sc_schedule_intrs, name);
    snprintf(name, sizeof(name), "thread_boosts/%u", cpu);
    syscnt_register(&runq->sc_boosts, name);
}
/*
 * Return the run queue of the local processor.
 *
 * The caller must either be pinned or have preemption disabled, so the
 * result cannot become stale from a migration.
 */
static inline struct thread_runq *
thread_runq_local(void)
{
    assert(thread_pinned() || !thread_preempt_enabled());
    return cpu_local_ptr(thread_runq);
}
/*
 * Return the identifier of the processor owning the given run queue.
 */
static inline unsigned int
thread_runq_cpu(struct thread_runq *runq)
{
    unsigned int cpu = runq->cpu;
    return cpu;
}
/*
 * Add a thread to a run queue.
 *
 * Called with interrupts disabled and the run queue locked. The thread
 * must not already be queued. Requests preemption of the current thread
 * when the new thread belongs to a higher (lower-index) class.
 */
static void
thread_runq_add(struct thread_runq *runq, struct thread *thread)
{
    const struct thread_sched_ops *ops;

    assert(!cpu_intr_enabled());
    spinlock_assert_locked(&runq->lock);
    assert(!thread->in_runq);

    ops = thread_get_real_sched_ops(thread);
    ops->add(runq, thread);

    /* Leaving the idle state: clear the hint before the count changes */
    if (runq->nr_threads == 0) {
        cpumap_clear_atomic(&thread_idle_runqs, thread_runq_cpu(runq));
    }

    runq->nr_threads++;

    /* Lower class index means higher scheduling priority */
    if (thread_real_sched_class(thread)
        < thread_real_sched_class(runq->current)) {
        thread_set_flag(runq->current, THREAD_YIELD);
    }

    atomic_store(&thread->runq, runq, ATOMIC_RELAXED);
    thread->in_runq = true;
}
/*
 * Remove a thread from a run queue.
 *
 * Called with interrupts disabled and the run queue locked. Mirrors
 * thread_runq_add(): the idle hint is set once the thread count drops
 * to zero.
 */
static void
thread_runq_remove(struct thread_runq *runq, struct thread *thread)
{
    const struct thread_sched_ops *ops;

    assert(!cpu_intr_enabled());
    spinlock_assert_locked(&runq->lock);
    assert(thread->in_runq);

    runq->nr_threads--;

    if (runq->nr_threads == 0) {
        cpumap_set_atomic(&thread_idle_runqs, thread_runq_cpu(runq));
    }

    ops = thread_get_real_sched_ops(thread);
    ops->remove(runq, thread);

    thread->in_runq = false;
}
/*
 * Hand the outgoing thread back to its scheduling class, if the class
 * cares (put_prev is optional).
 */
static void
thread_runq_put_prev(struct thread_runq *runq, struct thread *thread)
{
    const struct thread_sched_ops *ops;

    assert(!cpu_intr_enabled());
    spinlock_assert_locked(&runq->lock);

    ops = thread_get_real_sched_ops(thread);

    if (ops->put_prev) {
        ops->put_prev(runq, thread);
    }
}
/*
 * Pick the next thread to dispatch, scanning classes from highest to
 * lowest priority. The idle class always yields a thread, so this never
 * returns NULL.
 */
static struct thread *
thread_runq_get_next(struct thread_runq *runq)
{
    assert(!cpu_intr_enabled());
    spinlock_assert_locked(&runq->lock);

    for (size_t i = 0; i < ARRAY_SIZE(thread_sched_ops); i++) {
        struct thread *thread = thread_sched_ops[i].get_next(runq);

        if (thread != NULL) {
            atomic_store(&runq->current, thread, ATOMIC_RELAXED);
            return thread;
        }
    }

    /* The idle class should never be empty */
    panic("thread: unable to find next thread");
}
/*
 * Force the given thread to become the current one on this run queue,
 * letting its class pull it out of its internal queues first.
 */
static void
thread_runq_set_next(struct thread_runq *runq, struct thread *thread)
{
    const struct thread_sched_ops *ops = thread_get_real_sched_ops(thread);

    if (ops->set_next) {
        ops->set_next(runq, thread);
    }

    atomic_store(&runq->current, thread, ATOMIC_RELAXED);
}
/*
 * Make a runnable thread available on the given run queue.
 *
 * thread_runq_add() may flag the current thread for yield; a local run
 * queue is rescheduled on the way out anyway, so only remote processors
 * need an explicit scheduling request.
 */
static void
thread_runq_wakeup(struct thread_runq *runq, struct thread *thread)
{
    assert(!cpu_intr_enabled());
    spinlock_assert_locked(&runq->lock);
    assert(thread->state == THREAD_RUNNING);

    thread_runq_add(runq, thread);

    if ((runq != thread_runq_local())
        && thread_test_flag(runq->current, THREAD_YIELD)) {
        cpu_send_thread_schedule(thread_runq_cpu(runq));
    }
}
static void
thread_runq_wakeup_balancer(struct thread_runq *runq)
{
if (runq->balancer->state == THREAD_RUNNING) {
return;
}
thread_clear_wchan(runq->balancer);
runq->balancer->state = THREAD_RUNNING;
thread_runq_wakeup(runq, runq->balancer);
}
/*
 * Per-thread state to (re)install when a thread is dispatched on a
 * processor: address space, and performance monitoring context when
 * enabled.
 */
static void
thread_runq_schedule_load(struct thread *thread)
{
    pmap_load(thread->task->map->pmap);

#ifdef CONFIG_PERFMON
    perfmon_thread_load(thread);
#endif
}
/*
 * Counterpart of thread_runq_schedule_load(), called right before a
 * context switch away from the thread. Only perfmon state needs saving;
 * the pmap is simply replaced by the next thread's.
 */
static void
thread_runq_schedule_unload(struct thread *thread)
{
#ifdef CONFIG_PERFMON
    perfmon_thread_unload(thread);
#else
    (void)thread;
#endif
}
/*
 * Core scheduling function: select the next thread and context-switch
 * to it.
 *
 * Called with interrupts disabled, preemption disabled at exactly
 * THREAD_SUSPEND_PREEMPT_LEVEL, and the run queue locked. Returns the
 * (locked) run queue the caller is running on afterwards, which may
 * differ from the argument if the thread was migrated while suspended.
 */
static struct thread_runq *
thread_runq_schedule(struct thread_runq *runq)
{
    struct thread *prev, *next;

    prev = thread_self();

    /* Sanity check: we must be running on prev's own stack */
    assert((__builtin_frame_address(0) >= prev->stack)
           && (__builtin_frame_address(0) < (prev->stack + TCB_STACK_SIZE)));
    assert(prev->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL);
    assert(!cpu_intr_enabled());
    spinlock_assert_locked(&runq->lock);

    thread_clear_flag(prev, THREAD_YIELD);
    thread_runq_put_prev(runq, prev);

    /* A thread that is no longer running is going to sleep or die */
    if (prev->state != THREAD_RUNNING) {
        thread_runq_remove(runq, prev);

        /* Give the balancer a chance to refill an emptied queue */
        if ((runq->nr_threads == 0) && (prev != runq->balancer)) {
            thread_runq_wakeup_balancer(runq);
        }
    }

    next = thread_runq_get_next(runq);
    assert((next != runq->idler) || (runq->nr_threads == 0));
    assert(next->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL);

    if (likely(prev != next)) {
        thread_runq_schedule_unload(prev);
        rcu_report_context_switch(thread_rcu_reader(prev));
        spinlock_transfer_owner(&runq->lock, next);

        /*
         * That's where the true context switch occurs. The next thread must
         * unlock the run queue and reenable preemption. Note that unlocking
         * and locking the run queue again is equivalent to a full memory
         * barrier.
         */
        tcb_switch(&prev->tcb, &next->tcb);

        /*
         * The thread is dispatched on a processor once again.
         *
         * Keep in mind the system state may have changed a lot since this
         * function was called. In particular :
         * - The next thread may have been destroyed, and must not be
         * referenced any more.
         * - The current thread may have been migrated to another processor.
         */
        barrier();

        thread_runq_schedule_load(prev);

        /* next is stale after the switch (see above); drop the reference */
        next = NULL;
        runq = thread_runq_local();
    } else {
        next = NULL;
    }

    assert(prev->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL);
    assert(!cpu_intr_enabled());
    spinlock_assert_locked(&runq->lock);
    return runq;
}
/*
 * Lock two distinct run queues, always in ascending CPU order so that
 * concurrent double-locks cannot deadlock.
 */
static void
thread_runq_double_lock(struct thread_runq *a, struct thread_runq *b)
{
    struct thread_runq *first, *second;

    assert(!cpu_intr_enabled());
    assert(!thread_preempt_enabled());
    assert(a != b);

    if (a->cpu < b->cpu) {
        first = a;
        second = b;
    } else {
        first = b;
        second = a;
    }

    spinlock_lock(&first->lock);
    spinlock_lock(&second->lock);
}
/*
 * Select the run queue for a real-time thread and return it locked.
 *
 * Real-time tasks are commonly configured to run on one specific
 * processor only, so the first allowed CPU is simply used.
 */
static struct thread_runq *
thread_sched_rt_select_runq(struct thread *thread)
{
    struct thread_runq *runq;
    int cpu;

    cpu = cpumap_find_first(&thread->cpumap);
    assert(cpu >= 0);
    assert(cpumap_test(&thread_active_runqs, cpu));

    runq = percpu_ptr(thread_runq, cpu);
    spinlock_lock(&runq->lock);
    return runq;
}
/*
 * Queue a real-time thread at the tail of its priority list and mark
 * the priority as populated in the bitmap. Preempts the current thread
 * when it belongs to the same class but has a strictly lower priority
 * (cross-class preemption is handled in thread_runq_add()).
 */
static void
thread_sched_rt_add(struct thread_runq *runq, struct thread *thread)
{
    struct thread_rt_runq *rt_runq;
    struct list *threads;

    rt_runq = &runq->rt_runq;
    threads = &rt_runq->threads[thread_real_priority(thread)];
    list_insert_tail(threads, &thread->rt_data.node);

    /* First thread at this priority: publish the bit */
    if (list_singular(threads)) {
        rt_runq->bitmap |= (1ULL << thread_real_priority(thread));
    }

    if ((thread_real_sched_class(thread)
         == thread_real_sched_class(runq->current))
        && (thread_real_priority(thread)
            > thread_real_priority(runq->current))) {
        thread_set_flag(runq->current, THREAD_YIELD);
    }
}
/*
 * Unlink a real-time thread from its priority list, clearing the
 * priority bit when the list becomes empty.
 */
static void
thread_sched_rt_remove(struct thread_runq *runq, struct thread *thread)
{
    struct thread_rt_runq *rt_runq = &runq->rt_runq;
    struct list *threads = &rt_runq->threads[thread_real_priority(thread)];

    list_remove(&thread->rt_data.node);

    if (list_empty(threads)) {
        rt_runq->bitmap &= ~(1ULL << thread_real_priority(thread));
    }
}
/*
 * A preempted real-time thread simply goes back to its priority queue.
 */
static void
thread_sched_rt_put_prev(struct thread_runq *runq, struct thread *thread)
{
    thread_sched_rt_add(runq, thread);
}
/*
 * Pick the highest-priority runnable real-time thread, or NULL when the
 * class is empty. The selected thread is removed from the queues.
 *
 * The zero check guards __builtin_clz, which is undefined for 0.
 * NOTE(review): the priority formula is correct only if
 * THREAD_SCHED_RT_PRIO_MAX equals (bit width of bitmap - 1), i.e. 31
 * for an unsigned int bitmap — confirm against the header.
 */
static struct thread *
thread_sched_rt_get_next(struct thread_runq *runq)
{
    struct thread_rt_runq *rt_runq;
    struct thread *thread;
    struct list *threads;
    unsigned int priority;

    rt_runq = &runq->rt_runq;

    if (rt_runq->bitmap == 0) {
        return NULL;
    }

    priority = THREAD_SCHED_RT_PRIO_MAX - __builtin_clz(rt_runq->bitmap);
    threads = &rt_runq->threads[priority];
    assert(!list_empty(threads));
    thread = list_first_entry(threads, struct thread, rt_data.node);
    thread_sched_rt_remove(runq, thread);
    return thread;
}
/*
 * Reset class-specific data on a priority change. For the real-time
 * class, the priority itself needs no per-thread storage; only the
 * round-robin time slice is refilled.
 */
static void
thread_sched_rt_reset_priority(struct thread *thread, unsigned short priority)
{
    assert(priority <= THREAD_SCHED_RT_PRIO_MAX);
    thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE;
}
/*
 * Real-time global priority grows with the thread priority, above all
 * fair-scheduling and idle threads.
 */
static unsigned int
thread_sched_rt_get_global_priority(unsigned short priority)
{
    return priority + THREAD_SCHED_GLOBAL_PRIO_RT;
}
/*
 * Dispatching a specific thread means pulling it out of the queues.
 */
static void
thread_sched_rt_set_next(struct thread_runq *runq, struct thread *thread)
{
    thread_sched_rt_remove(runq, thread);
}
/*
 * Periodic tick for real-time threads: only round-robin threads consume
 * a time slice; FIFO threads run until they block or yield.
 */
static void
thread_sched_rt_tick(struct thread_runq *runq, struct thread *thread)
{
    (void)runq;

    if (thread_real_sched_policy(thread) != THREAD_SCHED_POLICY_RR) {
        return;
    }

    if (--thread->rt_data.time_slice > 0) {
        return;
    }

    /* Slice exhausted: refill and request preemption */
    thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE;
    thread_set_flag(thread, THREAD_YIELD);
}
/*
 * Map a fair-scheduling priority to a weight, one round slice base per
 * priority level (priority 0 still gets one slice).
 *
 * NOTE(review): the unsigned short return type truncates if
 * (priority + 1) * THREAD_FS_ROUND_SLICE_BASE exceeds 65535, which
 * depends on CLOCK_FREQ — confirm the product always fits.
 */
static inline unsigned short
thread_sched_fs_prio2weight(unsigned short priority)
{
    return ((priority + 1) * THREAD_FS_ROUND_SLICE_BASE);
}
/*
 * Select the run queue for a fair-scheduling thread and return it
 * locked: prefer a truly idle allowed run queue, otherwise the least
 * loaded allowed run queue in the highest round.
 *
 * At most one run queue lock is held at any time during the scan; the
 * candidate's lock is handed off whenever a better one is found.
 */
static struct thread_runq *
thread_sched_fs_select_runq(struct thread *thread)
{
    struct thread_runq *runq, *tmp;
    long delta;
    int i;

    cpumap_for_each(&thread_idle_runqs, i) {
        if (!cpumap_test(&thread->cpumap, i)) {
            continue;
        }

        runq = percpu_ptr(thread_runq, i);

        spinlock_lock(&runq->lock);

        /* The run queue really is idle, return it */
        if (runq->current == runq->idler) {
            goto out;
        }

        /* The idle map is only a hint; recheck under the lock failed */
        spinlock_unlock(&runq->lock);
    }

    runq = NULL;

    cpumap_for_each(&thread_active_runqs, i) {
        if (!cpumap_test(&thread->cpumap, i)) {
            continue;
        }

        tmp = percpu_ptr(thread_runq, i);

        spinlock_lock(&tmp->lock);

        if (runq == NULL) {
            runq = tmp;
            continue;
        }

        /* A run queue may have become idle */
        if (tmp->current == tmp->idler) {
            spinlock_unlock(&runq->lock);
            runq = tmp;
            goto out;
        }

        /*
         * The run queue isn't idle, but there are no fair-scheduling thread,
         * which means there are real-time threads.
         */
        if (tmp->fs_weight == 0) {
            spinlock_unlock(&tmp->lock);
            continue;
        }

        /* Signed difference handles round counter wrap-around */
        delta = (long)(tmp->fs_round - runq->fs_round);

        /* Look for the least loaded of the run queues in the highest round */
        if ((delta > 0)
            || ((delta == 0) && (tmp->fs_weight < runq->fs_weight))) {
            spinlock_unlock(&runq->lock);
            runq = tmp;
            continue;
        }

        spinlock_unlock(&tmp->lock);
    }

    assert(runq != NULL);

out:
    return runq;
}
/*
 * Rescale an amount of work from an old total weight to a new one,
 * i.e. work * new_weight / old_weight, using 64-bit intermediates to
 * avoid overflow.
 */
static unsigned int
thread_sched_fs_enqueue_scale(unsigned int work, unsigned int old_weight,
                              unsigned int new_weight)
{
    unsigned long long scaled;

    assert(old_weight != 0);

#ifndef __LP64__
    /* Fast path when the product is known to fit a 32-bit word */
    if (likely((work < 0x10000) && (new_weight < 0x10000))) {
        return (work * new_weight) / old_weight;
    }
#endif /* __LP64__ */

    scaled = (unsigned long long)work * new_weight;
    return (unsigned int)(scaled / old_weight);
}
/*
 * Insert a thread into a fair-scheduling run queue for the given round.
 *
 * The group list is kept sorted by decreasing weight: the thread's group
 * is moved towards the head as its weight grows. If the thread last ran
 * in the same round, its remaining work is simply accounted; otherwise
 * its work is rescaled so it gets a fair share of the current round.
 */
static void
thread_sched_fs_enqueue(struct thread_fs_runq *fs_runq, unsigned long round,
                        struct thread *thread)
{
    struct thread_fs_group *group, *tmp;
    struct list *node, *init_node;
    unsigned int group_weight, total_weight;

    assert(thread->fs_data.fs_runq == NULL);
    assert(thread->fs_data.work <= thread->fs_data.weight);

    group = &fs_runq->group_array[thread_real_priority(thread)];
    group_weight = group->weight + thread->fs_data.weight;
    total_weight = fs_runq->weight + thread->fs_data.weight;
    /* An empty group isn't linked yet; start scanning from the tail */
    node = (group->weight == 0)
           ? list_last(&fs_runq->groups)
           : list_prev(&group->node);
    init_node = node;

    /* Walk towards the head until a heavier (or equal) group is found */
    while (!list_end(&fs_runq->groups, node)) {
        tmp = list_entry(node, struct thread_fs_group, node);

        if (tmp->weight >= group_weight) {
            break;
        }

        node = list_prev(node);
    }

    if (group->weight == 0) {
        list_insert_after(&group->node, node);
    } else if (node != init_node) {
        list_remove(&group->node);
        list_insert_after(&group->node, node);
    }

    /*
     * XXX Unfairness can occur if the run queue round wraps around and the
     * thread is "lucky" enough to have the same round value. This should be
     * rare and harmless otherwise.
     */
    if (thread->fs_data.round == round) {
        fs_runq->work += thread->fs_data.work;
        group->work += thread->fs_data.work;
    } else {
        unsigned int group_work, thread_work;

        if (fs_runq->weight == 0) {
            thread_work = 0;
        } else {
            /*
             * Give the thread the work it would have accumulated had it
             * been queued since the start of the round.
             */
            group_work = (group->weight == 0)
                         ? thread_sched_fs_enqueue_scale(fs_runq->work,
                                                         fs_runq->weight,
                                                         thread->fs_data.weight)
                         : thread_sched_fs_enqueue_scale(group->work,
                                                         group->weight,
                                                         group_weight);
            thread_work = group_work - group->work;
            fs_runq->work += thread_work;
            group->work = group_work;
        }

        thread->fs_data.round = round;
        thread->fs_data.work = thread_work;
    }

    fs_runq->nr_threads++;
    fs_runq->weight = total_weight;
    group->weight = group_weight;

    /* Insert at the front of the group to improve interactivity */
    list_insert_head(&group->threads, &thread->fs_data.group_node);
    list_insert_tail(&fs_runq->threads, &thread->fs_data.runq_node);
    thread->fs_data.fs_runq = fs_runq;
}
/*
 * Restart fair scheduling from the heaviest group of the active queue,
 * preempting the current thread if it belongs to this class.
 */
static void
thread_sched_fs_restart(struct thread_runq *runq)
{
    struct thread_fs_runq *fs_runq = runq->fs_runq_active;
    struct list *node = list_first(&fs_runq->groups);

    assert(node != NULL);
    fs_runq->current = list_entry(node, struct thread_fs_group, node);

    if (thread_real_sched_class(runq->current) == THREAD_SCHED_CLASS_FS) {
        thread_set_flag(runq->current, THREAD_YIELD);
    }
}
/*
 * Class add operation: account the thread's weight on the run queue and
 * enqueue it in the active queue for the current round.
 *
 * A previously empty queue joins the globally highest round so it does
 * not unfairly lag behind the other processors.
 */
static void
thread_sched_fs_add(struct thread_runq *runq, struct thread *thread)
{
    unsigned int total_weight;

    if (runq->fs_weight == 0) {
        runq->fs_round = thread_fs_highest_round;
    }

    total_weight = runq->fs_weight + thread->fs_data.weight;

    /* TODO Limit the maximum number of threads to prevent this situation */
    if (total_weight < runq->fs_weight) {
        panic("thread: weight overflow");
    }

    runq->fs_weight = total_weight;
    thread_sched_fs_enqueue(runq->fs_runq_active, runq->fs_round, thread);
    thread_sched_fs_restart(runq);
}
/*
 * Remove a thread from its fair-scheduling run queue, reversing
 * thread_sched_fs_enqueue(): work and weight are uncharged, and the
 * group is moved towards the tail (or unlinked when empty) to keep the
 * group list sorted by decreasing weight.
 */
static void
thread_sched_fs_dequeue(struct thread *thread)
{
    struct thread_fs_runq *fs_runq;
    struct thread_fs_group *group, *tmp;
    struct list *node, *init_node;

    assert(thread->fs_data.fs_runq != NULL);

    fs_runq = thread->fs_data.fs_runq;
    group = &fs_runq->group_array[thread_real_priority(thread)];
    thread->fs_data.fs_runq = NULL;
    list_remove(&thread->fs_data.runq_node);
    list_remove(&thread->fs_data.group_node);
    fs_runq->work -= thread->fs_data.work;
    group->work -= thread->fs_data.work;
    fs_runq->weight -= thread->fs_data.weight;
    group->weight -= thread->fs_data.weight;
    fs_runq->nr_threads--;

    if (group->weight == 0) {
        list_remove(&group->node);
    } else {
        /* Walk towards the tail until a lighter (or equal) group is found */
        node = list_next(&group->node);
        init_node = node;

        while (!list_end(&fs_runq->groups, node)) {
            tmp = list_entry(node, struct thread_fs_group, node);

            if (tmp->weight <= group->weight) {
                break;
            }

            node = list_next(node);
        }

        if (node != init_node) {
            list_remove(&group->node);
            list_insert_before(&group->node, node);
        }
    }
}
/*
 * Class remove operation: uncharge the thread's weight and dequeue it.
 * When the active queue is drained, wake the balancer so it can start
 * the next round or pull remote work; otherwise restart from the
 * heaviest group.
 */
static void
thread_sched_fs_remove(struct thread_runq *runq, struct thread *thread)
{
    struct thread_fs_runq *fs_runq;

    runq->fs_weight -= thread->fs_data.weight;
    fs_runq = thread->fs_data.fs_runq;
    thread_sched_fs_dequeue(thread);

    if (fs_runq == runq->fs_runq_active) {
        if (fs_runq->nr_threads == 0) {
            thread_runq_wakeup_balancer(runq);
        } else {
            thread_sched_fs_restart(runq);
        }
    }
}
/*
 * Move a thread that consumed its whole weight from the active queue to
 * the expired queue, carrying its surplus work into the next round.
 * An emptied active queue means the round is over: wake the balancer.
 */
static void
thread_sched_fs_deactivate(struct thread_runq *runq, struct thread *thread)
{
    assert(thread->fs_data.fs_runq == runq->fs_runq_active);
    assert(thread->fs_data.round == runq->fs_round);

    thread_sched_fs_dequeue(thread);
    thread->fs_data.round++;
    /* Keep only the work done beyond the allocated weight */
    thread->fs_data.work -= thread->fs_data.weight;
    thread_sched_fs_enqueue(runq->fs_runq_expired, runq->fs_round + 1, thread);

    if (runq->fs_runq_active->nr_threads == 0) {
        thread_runq_wakeup_balancer(runq);
    }
}
/*
 * Class put_prev operation: requeue the outgoing thread at the tail of
 * its group (it was removed from the group list by get_next/set_next),
 * then deactivate it if it exhausted its weight for this round.
 */
static void
thread_sched_fs_put_prev(struct thread_runq *runq, struct thread *thread)
{
    struct thread_fs_runq *fs_runq;
    struct thread_fs_group *group;

    fs_runq = runq->fs_runq_active;
    group = &fs_runq->group_array[thread_real_priority(thread)];
    list_insert_tail(&group->threads, &thread->fs_data.group_node);

    if (thread->fs_data.work >= thread->fs_data.weight) {
        thread_sched_fs_deactivate(runq, thread);
    }
}
/*
 * Return true when the current group's work/weight ratio exceeds the
 * next group's, i.e. (current->work + 1) / current->weight is greater
 * than (next->work + 1) / next->weight, compared by cross-multiplying.
 * The +1 biases the comparison so zero-work groups still order sensibly.
 */
static int
thread_sched_fs_ratio_exceeded(struct thread_fs_group *current,
                               struct thread_fs_group *next)
{
    unsigned long long a, b;

#ifndef __LP64__
    unsigned int ia, ib;

    /* Fast path: both products fit a 32-bit word */
    if (likely((current->weight < 0x10000) && (next->weight < 0x10000))) {
        ia = (current->work + 1) * next->weight;
        ib = (next->work + 1) * current->weight;
        return ia > ib;
    }
#endif /* __LP64__ */

    a = ((unsigned long long)current->work + 1) * next->weight;
    b = ((unsigned long long)next->work + 1) * current->weight;
    return a > b;
}
/*
 * GR3 selection: advance to the next group if the current one has
 * consumed more than its proportional share compared to it, otherwise
 * wrap back to the heaviest (first) group. The chosen thread is removed
 * from its group list (put_prev reinserts it at the tail).
 *
 * Returns NULL when the active queue holds no fair-scheduling thread.
 */
static struct thread *
thread_sched_fs_get_next(struct thread_runq *runq)
{
    struct thread_fs_runq *fs_runq;
    struct thread_fs_group *group, *next;
    struct thread *thread;
    struct list *node;

    fs_runq = runq->fs_runq_active;

    if (fs_runq->nr_threads == 0) {
        return NULL;
    }

    group = fs_runq->current;
    node = list_next(&group->node);

    if (list_end(&fs_runq->groups, node)) {
        node = list_first(&fs_runq->groups);
        group = list_entry(node, struct thread_fs_group, node);
    } else {
        next = list_entry(node, struct thread_fs_group, node);

        if (thread_sched_fs_ratio_exceeded(group, next)) {
            group = next;
        } else {
            node = list_first(&fs_runq->groups);
            group = list_entry(node, struct thread_fs_group, node);
        }
    }

    fs_runq->current = group;
    node = list_first(&group->threads);
    thread = list_entry(node, struct thread, fs_data.group_node);
    list_remove(node);
    return thread;
}
/*
 * Reset the fair-scheduling data of a thread for a new priority:
 * detached from any queue, zero work, round 0, weight derived from the
 * priority.
 */
static void
thread_sched_fs_reset_priority(struct thread *thread, unsigned short priority)
{
    assert(priority <= THREAD_SCHED_FS_PRIO_MAX);

    thread->fs_data.fs_runq = NULL;
    thread->fs_data.round = 0;
    thread->fs_data.work = 0;
    thread->fs_data.weight = thread_sched_fs_prio2weight(priority);
}
/*
 * Update the weight of a thread after a priority change, clamping its
 * accumulated work so it never exceeds the new weight.
 */
static void
thread_sched_fs_update_priority(struct thread *thread, unsigned short priority)
{
    unsigned int weight;

    assert(priority <= THREAD_SCHED_FS_PRIO_MAX);

    weight = thread_sched_fs_prio2weight(priority);
    thread->fs_data.weight = weight;

    if (thread->fs_data.work >= weight) {
        thread->fs_data.work = weight;
    }
}
/*
 * All fair-scheduling threads share a single global priority, whatever
 * their class-local priority.
 */
static unsigned int
thread_sched_fs_get_global_priority(unsigned short priority)
{
    (void)priority;

    return THREAD_SCHED_GLOBAL_PRIO_FS;
}
/*
 * Dispatch a specific thread: unlink it from its group list, mirroring
 * what get_next does for the selected thread.
 */
static void
thread_sched_fs_set_next(struct thread_runq *runq, struct thread *thread)
{
    (void)runq;

    list_remove(&thread->fs_data.group_node);
}
/*
 * Periodic tick for fair-scheduling threads: charge one time unit of
 * work at the run queue, group and thread levels, and always request a
 * yield since GR3 round-robins at the tick granularity.
 */
static void
thread_sched_fs_tick(struct thread_runq *runq, struct thread *thread)
{
    struct thread_fs_runq *fs_runq = runq->fs_runq_active;
    struct thread_fs_group *group;

    group = &fs_runq->group_array[thread_real_priority(thread)];

    fs_runq->work++;
    group->work++;
    thread->fs_data.work++;

    thread_set_flag(thread, THREAD_YIELD);
}
/*
 * Swap the active and expired queues to begin a new round. If the new
 * active queue holds threads, bump the local round and propagate it to
 * the global highest round when it leads (signed delta handles
 * wrap-around).
 */
static void
thread_sched_fs_start_next_round(struct thread_runq *runq)
{
    struct thread_fs_runq *tmp;
    long delta;

    tmp = runq->fs_runq_expired;
    runq->fs_runq_expired = runq->fs_runq_active;
    runq->fs_runq_active = tmp;

    if (runq->fs_runq_active->nr_threads != 0) {
        runq->fs_round++;
        delta = (long)(runq->fs_round - thread_fs_highest_round);

        if (delta > 0) {
            thread_fs_highest_round = runq->fs_round;
        }

        thread_sched_fs_restart(runq);
    }
}
/*
 * Check that a remote run queue satisfies the minimum migration requirements.
 *
 * A queue is eligible when it has fair-scheduling threads, is in the
 * highest round or the one just before (unsigned arithmetic makes the
 * wrap-around case work), and has at least one thread that could
 * actually be pulled (the current thread is never migrated).
 */
static int
thread_sched_fs_balance_eligible(struct thread_runq *runq,
                                 unsigned long highest_round)
{
    unsigned int nr_threads;

    if (runq->fs_weight == 0) {
        return 0;
    }

    if ((runq->fs_round != highest_round)
        && (runq->fs_round != (highest_round - 1))) {
        return 0;
    }

    nr_threads = runq->fs_runq_active->nr_threads
                 + runq->fs_runq_expired->nr_threads;

    if ((nr_threads == 0)
        || ((nr_threads == 1)
            && (thread_real_sched_class(runq->current)
                == THREAD_SCHED_CLASS_FS))) {
        return 0;
    }

    return 1;
}
/*
 * Try to find the most suitable run queue from which to pull threads.
 *
 * Returns the eligible remote run queue with the highest fair-scheduling
 * weight, or NULL. The returned queue is unlocked, so its state may change
 * before the caller relocks it.
 */
static struct thread_runq *
thread_sched_fs_balance_scan(struct thread_runq *runq,
                             unsigned long highest_round)
{
    struct thread_runq *remote_runq, *tmp;
    unsigned long flags;
    int i;

    remote_runq = NULL;

    /* Run queue locks must be taken with interrupts disabled */
    thread_preempt_disable_intr_save(&flags);

    cpumap_for_each(&thread_active_runqs, i) {
        tmp = percpu_ptr(thread_runq, i);

        if (tmp == runq) {
            continue;
        }

        spinlock_lock(&tmp->lock);

        if (!thread_sched_fs_balance_eligible(tmp, highest_round)) {
            spinlock_unlock(&tmp->lock);
            continue;
        }

        /* First eligible candidate found, keep it locked */
        if (remote_runq == NULL) {
            remote_runq = tmp;
            continue;
        }

        /* Keep the heaviest candidate, releasing the previous best */
        if (tmp->fs_weight > remote_runq->fs_weight) {
            spinlock_unlock(&remote_runq->lock);
            remote_runq = tmp;
            continue;
        }

        spinlock_unlock(&tmp->lock);
    }

    /* The candidate is returned unlocked, see the function description */
    if (remote_runq != NULL) {
        spinlock_unlock(&remote_runq->lock);
    }

    thread_preempt_enable_intr_restore(flags);

    return remote_runq;
}
/*
 * Pull threads from the given remote fair-scheduling queue to the local
 * run queue.
 *
 * Both run queue locks must be held. nr_pulls is the number of threads
 * already migrated during this balancing pass; the updated total is
 * returned, capped at THREAD_MAX_MIGRATIONS.
 */
static unsigned int
thread_sched_fs_balance_pull(struct thread_runq *runq,
                             struct thread_runq *remote_runq,
                             struct thread_fs_runq *fs_runq,
                             unsigned int nr_pulls)
{
    struct thread *thread, *tmp;
    int cpu;

    cpu = thread_runq_cpu(runq);

    list_for_each_entry_safe(&fs_runq->threads, thread, tmp,
                             fs_data.runq_node) {
        /* The currently running thread can't be migrated */
        if (thread == remote_runq->current) {
            continue;
        }

        /*
         * The pin level is changed without explicit synchronization.
         * However, it can only be changed by its owning thread. As threads
         * currently running aren't considered for migration, the thread had
         * to be preempted and invoke the scheduler. Since balancer threads
         * acquire the run queue lock, there is strong ordering between
         * changing the pin level and setting the current thread of a
         * run queue.
         *
         * TODO Review comment.
         */
        if (thread->pin_level != 0) {
            continue;
        }

        /* Honor the thread's processor affinity */
        if (!cpumap_test(&thread->cpumap, cpu)) {
            continue;
        }

        /*
         * Make sure at least one thread is pulled if possible. If one or more
         * thread has already been pulled, take weights into account.
         */
        if ((nr_pulls != 0)
            && ((runq->fs_weight + thread->fs_data.weight)
                > (remote_runq->fs_weight - thread->fs_data.weight))) {
            break;
        }

        thread_runq_remove(remote_runq, thread);

        /* Don't discard the work already accounted for */
        thread->fs_data.round = runq->fs_round;

        thread_runq_add(runq, thread);
        nr_pulls++;

        if (nr_pulls == THREAD_MAX_MIGRATIONS) {
            break;
        }
    }

    return nr_pulls;
}
static unsigned int
thread_sched_fs_balance_migrate(struct thread_runq *runq,
                                struct thread_runq *remote_runq,
                                unsigned long highest_round)
{
    unsigned int nr_pulls;

    /* The remote queue may have changed since it was scanned unlocked */
    if (!thread_sched_fs_balance_eligible(remote_runq, highest_round)) {
        return 0;
    }

    nr_pulls = thread_sched_fs_balance_pull(runq, remote_runq,
                                            remote_runq->fs_runq_active, 0);

    if (nr_pulls == THREAD_MAX_MIGRATIONS) {
        return nr_pulls;
    }

    /*
     * Threads in the expired queue of a processor in round highest are
     * actually in round highest + 1.
     */
    if (remote_runq->fs_round != highest_round) {
        nr_pulls = thread_sched_fs_balance_pull(runq, remote_runq,
                                                remote_runq->fs_runq_expired,
                                                nr_pulls);
    }

    return nr_pulls;
}
/*
 * Inter-processor load balancing for fair-scheduling threads.
 *
 * Preemption must be disabled, and the local run queue must be locked when
 * calling this function. If balancing actually occurs, the lock will be
 * released and preemption enabled when needed.
 */
static void
thread_sched_fs_balance(struct thread_runq *runq, unsigned long *flags)
{
    struct thread_runq *remote_runq;
    unsigned long highest_round;
    unsigned int nr_migrations;
    int i;

    /*
     * Grab the highest round now and only use the copy so the value is stable
     * during the balancing operation.
     */
    highest_round = thread_fs_highest_round;

    /* A lagging queue with expired threads can advance to the next round */
    if ((runq->fs_round != highest_round)
        && (runq->fs_runq_expired->nr_threads != 0)) {
        goto no_migration;
    }

    /* Release the local lock while scanning the remote queues */
    spinlock_unlock_intr_restore(&runq->lock, *flags);
    thread_preempt_enable();

    remote_runq = thread_sched_fs_balance_scan(runq, highest_round);

    if (remote_runq != NULL) {
        thread_preempt_disable_intr_save(flags);
        thread_runq_double_lock(runq, remote_runq);
        nr_migrations = thread_sched_fs_balance_migrate(runq, remote_runq,
                                                        highest_round);
        spinlock_unlock(&remote_runq->lock);

        /* On success, return with the local lock held, as documented */
        if (nr_migrations != 0) {
            return;
        }

        spinlock_unlock_intr_restore(&runq->lock, *flags);
        thread_preempt_enable();
    }

    /*
     * The scan or the migration failed. As a fallback, make another, simpler
     * pass on every run queue, and stop as soon as at least one thread could
     * be successfully pulled.
     */
    cpumap_for_each(&thread_active_runqs, i) {
        remote_runq = percpu_ptr(thread_runq, i);

        if (remote_runq == runq) {
            continue;
        }

        thread_preempt_disable_intr_save(flags);
        thread_runq_double_lock(runq, remote_runq);
        nr_migrations = thread_sched_fs_balance_migrate(runq, remote_runq,
                                                        highest_round);
        spinlock_unlock(&remote_runq->lock);

        if (nr_migrations != 0) {
            return;
        }

        spinlock_unlock_intr_restore(&runq->lock, *flags);
        thread_preempt_enable();
    }

    /* Reacquire the local lock before possibly switching rounds */
    thread_preempt_disable();
    spinlock_lock_intr_save(&runq->lock, flags);

no_migration:

    /*
     * No thread could be migrated. Check the active run queue, as another
     * processor might have added threads while the balancer was running.
     * If the run queue is still empty, switch to the next round. The run
     * queue lock must remain held until the next scheduling decision to
     * prevent a remote balancer thread from stealing active threads.
     */
    if (runq->fs_runq_active->nr_threads == 0) {
        thread_sched_fs_start_next_round(runq);
    }
}
/*
 * Selecting a run queue only happens on wakeup, which is invalid for
 * idler threads : they are permanently bound to their processor and
 * never sleep through the regular wakeup path.
 */
static struct thread_runq *
thread_sched_idle_select_runq(struct thread *thread)
{
    (void)thread;
    /* Fixed grammar in the panic message ("awaken" -> "awakened") */
    panic("thread: idler threads cannot be awakened");
}
/*
 * Common panic for operations that are invalid on the idle class.
 */
static noreturn void
thread_sched_idle_panic(void)
{
    panic("thread: only idle threads are allowed in the idle class");
}
/* Threads are never added to the idle class at run time */
static void
thread_sched_idle_add(struct thread_runq *runq, struct thread *thread)
{
    (void)runq;
    (void)thread;
    thread_sched_idle_panic();
}
/* Threads are never removed from the idle class at run time */
static void
thread_sched_idle_remove(struct thread_runq *runq, struct thread *thread)
{
    (void)runq;
    (void)thread;
    thread_sched_idle_panic();
}
/* The idler thread is dispatched when nothing else is runnable */
static struct thread *
thread_sched_idle_get_next(struct thread_runq *runq)
{
    return runq->idler;
}
/* All idle-class threads share the idle global priority */
static unsigned int
thread_sched_idle_get_global_priority(unsigned short priority)
{
    (void)priority;
    return THREAD_SCHED_GLOBAL_PRIO_IDLE;
}
/*
 * Scheduling operations, indexed by scheduling class.
 *
 * NULL entries denote operations a class doesn't need; callers check
 * for NULL before invoking them.
 */
static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES]
    = {
    [THREAD_SCHED_CLASS_RT] = {
        .select_runq = thread_sched_rt_select_runq,
        .add = thread_sched_rt_add,
        .remove = thread_sched_rt_remove,
        .put_prev = thread_sched_rt_put_prev,
        .get_next = thread_sched_rt_get_next,
        .reset_priority = thread_sched_rt_reset_priority,
        .update_priority = NULL,
        .get_global_priority = thread_sched_rt_get_global_priority,
        .set_next = thread_sched_rt_set_next,
        .tick = thread_sched_rt_tick,
    },
    [THREAD_SCHED_CLASS_FS] = {
        .select_runq = thread_sched_fs_select_runq,
        .add = thread_sched_fs_add,
        .remove = thread_sched_fs_remove,
        .put_prev = thread_sched_fs_put_prev,
        .get_next = thread_sched_fs_get_next,
        .reset_priority = thread_sched_fs_reset_priority,
        .update_priority = thread_sched_fs_update_priority,
        .get_global_priority = thread_sched_fs_get_global_priority,
        .set_next = thread_sched_fs_set_next,
        .tick = thread_sched_fs_tick,
    },
    [THREAD_SCHED_CLASS_IDLE] = {
        .select_runq = thread_sched_idle_select_runq,
        .add = thread_sched_idle_add,
        .remove = thread_sched_idle_remove,
        .put_prev = NULL,
        .get_next = thread_sched_idle_get_next,
        .reset_priority = NULL,
        .update_priority = NULL,
        .get_global_priority = thread_sched_idle_get_global_priority,
        .set_next = NULL,
        .tick = NULL,
    },
};
/* Set the user-requested scheduling policy, without any side effect */
static void
thread_set_user_sched_policy(struct thread *thread, unsigned char sched_policy)
{
    thread->user_sched_data.sched_policy = sched_policy;
}
/* Set the user-requested scheduling class, without any side effect */
static void
thread_set_user_sched_class(struct thread *thread, unsigned char sched_class)
{
    thread->user_sched_data.sched_class = sched_class;
}
static void
thread_set_user_priority(struct thread *thread, unsigned short priority)
{
    const struct thread_sched_ops *ops = thread_get_user_sched_ops(thread);

    /* Keep the cached global priority consistent with the class priority */
    thread->user_sched_data.priority = priority;
    thread->user_sched_data.global_priority = ops->get_global_priority(priority);
}
/* Update the user priority, currently equivalent to setting it */
static void
thread_update_user_priority(struct thread *thread, unsigned short priority)
{
    thread_set_user_priority(thread, priority);
}
/* Set the real (effective) scheduling policy, without any side effect */
static void
thread_set_real_sched_policy(struct thread *thread, unsigned char sched_policy)
{
    thread->real_sched_data.sched_policy = sched_policy;
}
/* Set the real (effective) scheduling class, without any side effect */
static void
thread_set_real_sched_class(struct thread *thread, unsigned char sched_class)
{
    thread->real_sched_data.sched_class = sched_class;
}
static void
thread_set_real_priority(struct thread *thread, unsigned short priority)
{
    const struct thread_sched_ops *ops = thread_get_real_sched_ops(thread);

    /* Update both the class-local and the cached global priority */
    thread->real_sched_data.priority = priority;
    thread->real_sched_data.global_priority = ops->get_global_priority(priority);

    /* Let the scheduling class reinitialize its per-thread state, if any */
    if (ops->reset_priority != NULL) {
        ops->reset_priority(thread, priority);
    }
}
static void
thread_update_real_priority(struct thread *thread, unsigned short priority)
{
    const struct thread_sched_ops *ops = thread_get_real_sched_ops(thread);

    /* Refresh both the class-local and the cached global priority */
    thread->real_sched_data.priority = priority;
    thread->real_sched_data.global_priority = ops->get_global_priority(priority);

    /* Let the scheduling class adjust its per-thread state, if any */
    if (ops->update_priority != NULL) {
        ops->update_priority(thread, priority);
    }
}
static void
thread_reset_real_priority(struct thread *thread)
{
    const struct thread_sched_ops *ops;

    /* Drop any priority boost and return to the user-set parameters */
    thread->real_sched_data = thread->user_sched_data;
    thread->boosted = false;

    ops = thread_get_user_sched_ops(thread);

    if (ops->reset_priority != NULL) {
        ops->reset_priority(thread, thread->real_sched_data.priority);
    }
}
/*
 * Initialize the statically allocated thread used to bootstrap
 * scheduling on the given processor.
 */
static void __init
thread_init_booter(unsigned int cpu)
{
    struct thread *booter;

    /* Initialize only what's needed during bootstrap */
    booter = &thread_booters[cpu];
    booter->nr_refs = 0; /* Make sure booters aren't destroyed */
    booter->flags = 0;
    booter->intr_level = 0;
    booter->preempt_level = 1;
    rcu_reader_init(&booter->rcu_reader);
    cpumap_fill(&booter->cpumap);
    thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE);
    thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE);
    thread_set_user_priority(booter, 0);
    thread_reset_real_priority(booter);
    memset(booter->tsd, 0, sizeof(booter->tsd));
    booter->task = task_get_kernel_task();
    snprintf(booter->name, sizeof(booter->name),
             THREAD_KERNEL_PREFIX "thread_boot/%u", cpu);
}
/*
 * Make the BSP run on its booter thread during early initialization.
 */
static int __init
thread_setup_booter(void)
{
    tcb_set_current(&thread_booters[0].tcb);
    thread_init_booter(0);
    return 0;
}

INIT_OP_DEFINE(thread_setup_booter,
               INIT_OP_DEP(tcb_setup, true));
/*
 * Early initialization of the scheduler global state and of the BSP
 * run queue. Secondary processors are handled later, see thread_setup().
 */
static int __init
thread_bootstrap(void)
{
    cpumap_zero(&thread_active_runqs);
    cpumap_zero(&thread_idle_runqs);

    thread_fs_highest_round = THREAD_FS_INITIAL_ROUND;

    /* Only the BSP run queue exists at this point */
    cpumap_set(&thread_active_runqs, 0);
    thread_runq_init(cpu_local_ptr(thread_runq), 0, &thread_booters[0]);

    return 0;
}

INIT_OP_DEFINE(thread_bootstrap,
               INIT_OP_DEP(syscnt_setup, true),
               INIT_OP_DEP(thread_setup_booter, true));
/*
 * Common entry point of all threads.
 *
 * Starts with the local run queue lock held and interrupts disabled, as
 * left by the context switch; both are released before running the
 * thread function. The thread implicitly exits when that function returns.
 */
void
thread_main(void (*fn)(void *), void *arg)
{
    struct thread *thread;

    assert(!cpu_intr_enabled());
    assert(!thread_preempt_enabled());

    thread = thread_self();
    thread_runq_schedule_load(thread);

    spinlock_unlock(&thread_runq_local()->lock);
    cpu_intr_enable();
    thread_preempt_enable();

    fn(arg);
    thread_exit();
}
static void
thread_destroy_tsd(struct thread *thread)
{
    /* Run the destructor of every live thread-specific data slot */
    for (unsigned int i = 0; i < thread_nr_keys; ) {
        void *ptr;

        if ((thread->tsd[i] == NULL) || (thread_dtors[i] == NULL)) {
            i++;
            continue;
        }

        /*
         * Follow the POSIX description of TSD : set the key to NULL before
         * calling the destructor and repeat as long as it's not NULL.
         */
        ptr = thread->tsd[i];
        thread->tsd[i] = NULL;
        thread_dtors[i](ptr);

        /* Only advance once the destructor left the slot NULL */
        if (thread->tsd[i] == NULL) {
            i++;
        }
    }
}
/*
 * Initialize a thread and attach it to its task.
 *
 * The thread is created sleeping with one reference and must be started
 * with thread_wakeup(). The task and CPU map are inherited from the
 * caller when the attributes don't provide them. Returns 0 or ENOMEM.
 */
static int
thread_init(struct thread *thread, void *stack,
            const struct thread_attr *attr,
            void (*fn)(void *), void *arg)
{
    struct thread *caller;
    struct task *task;
    struct cpumap *cpumap;
    int error;

    caller = thread_self();

    /* Unspecified attributes are inherited from the creating thread */
    task = (attr->task == NULL) ? caller->task : attr->task;
    cpumap = (attr->cpumap == NULL) ? &caller->cpumap : attr->cpumap;
    assert(attr->policy < ARRAY_SIZE(thread_policy_table));

    thread->nr_refs = 1;
    thread->flags = 0;
    thread->runq = NULL;
    thread->in_runq = false;
    thread_set_wchan(thread, thread, "init");
    thread->state = THREAD_SLEEPING;

    thread->priv_sleepq = sleepq_create();

    if (thread->priv_sleepq == NULL) {
        error = ENOMEM;
        goto error_sleepq;
    }

    thread->priv_turnstile = turnstile_create();

    if (thread->priv_turnstile == NULL) {
        error = ENOMEM;
        goto error_turnstile;
    }

    turnstile_td_init(&thread->turnstile_td);
    thread->propagate_priority = false;

    /* Preemption is effectively reenabled when the thread starts running */
    thread->preempt_level = THREAD_SUSPEND_PREEMPT_LEVEL;
    thread->pin_level = 0;
    thread->intr_level = 0;
    rcu_reader_init(&thread->rcu_reader);
    cpumap_copy(&thread->cpumap, cpumap);
    thread_set_user_sched_policy(thread, attr->policy);
    thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy));
    thread_set_user_priority(thread, attr->priority);
    thread_reset_real_priority(thread);
    memset(thread->tsd, 0, sizeof(thread->tsd));
    thread->join_waiter = NULL;
    spinlock_init(&thread->join_lock);
    thread->terminating = false;
    thread->task = task;
    thread->stack = stack;
    strlcpy(thread->name, attr->name, sizeof(thread->name));

    if (attr->flags & THREAD_ATTR_DETACHED) {
        thread->flags |= THREAD_DETACHED;
    }

#ifdef CONFIG_PERFMON
    error = perfmon_thread_init(thread);

    if (error) {
        goto error_perfmon;
    }
#endif /* CONFIG_PERFMON */

    error = tcb_build(&thread->tcb, stack, fn, arg);

    if (error) {
        goto error_tcb;
    }

    task_add_thread(task, thread);

    return 0;

    /* Release resources in reverse order of acquisition */
error_tcb:
#ifdef CONFIG_PERFMON
    perfmon_thread_destroy(thread);
error_perfmon:
#endif /* CONFIG_PERFMON */
    thread_destroy_tsd(thread);
    turnstile_destroy(thread->priv_turnstile);
error_turnstile:
    sleepq_destroy(thread->priv_sleepq);
error_sleepq:
    return error;
}
/*
 * Lock the run queue a thread is attached to.
 *
 * The thread may be migrated concurrently, so retry until the queue read
 * before locking is still the thread's queue once the lock is held.
 * Returns with the queue lock held and interrupt state saved in *flags.
 */
static struct thread_runq *
thread_lock_runq(struct thread *thread, unsigned long *flags)
{
    struct thread_runq *runq;

    for (;;) {
        runq = atomic_load(&thread->runq, ATOMIC_RELAXED);

        spinlock_lock_intr_save(&runq->lock, flags);

        /* Recheck after locking : the thread may have migrated meanwhile */
        if (runq == atomic_load(&thread->runq, ATOMIC_RELAXED)) {
            return runq;
        }

        spinlock_unlock_intr_restore(&runq->lock, *flags);
    }
}
/* Release a run queue lock taken with thread_lock_runq() */
static void
thread_unlock_runq(struct thread_runq *runq, unsigned long flags)
{
    spinlock_unlock_intr_restore(&runq->lock, flags);
}
#ifdef CONFIG_THREAD_STACK_GUARD
#include
#include
#include
/*
 * Allocate a thread stack surrounded by two unmapped guard pages, so
 * that stack overflows and underflows fault instead of corrupting
 * adjacent memory. Returns the usable stack area, or NULL.
 */
static void *
thread_alloc_stack(void)
{
    struct vm_page *first_page, *last_page;
    phys_addr_t first_pa, last_pa;
    struct pmap *kernel_pmap;
    size_t stack_size;
    uintptr_t va;
    void *mem;
    int error;

    kernel_pmap = pmap_get_kernel_pmap();

    stack_size = vm_page_round(TCB_STACK_SIZE);

    /* One extra page on each side of the stack for the guards */
    mem = vm_kmem_alloc((PAGE_SIZE * 2) + stack_size);

    if (mem == NULL) {
        return NULL;
    }

    va = (uintptr_t)mem;

    /*
     * TODO Until memory protection is implemented, use the pmap system
     * to remove mappings.
     */
    error = pmap_kextract(va, &first_pa);
    assert(!error);

    error = pmap_kextract(va + PAGE_SIZE + stack_size, &last_pa);
    assert(!error);

    first_page = vm_page_lookup(first_pa);
    assert(first_page != NULL);

    last_page = vm_page_lookup(last_pa);
    assert(last_page != NULL);

    /* Unmap the guard pages; accesses to them will now fault */
    pmap_remove(kernel_pmap, va, cpumap_all());
    pmap_remove(kernel_pmap, va + PAGE_SIZE + stack_size, cpumap_all());
    pmap_update(kernel_pmap);

    /* The usable area starts right after the first guard page */
    return (void *)va + PAGE_SIZE;
}
static void
thread_free_stack(void *stack)
{
    size_t stack_size;

    /* The stack is preceded by a guard page, see thread_alloc_stack() */
    stack_size = vm_page_round(TCB_STACK_SIZE);
    vm_kmem_free((void *)stack - PAGE_SIZE, (PAGE_SIZE * 2) + stack_size);
}
#else /* CONFIG_THREAD_STACK_GUARD */
/* Allocate a thread stack from the stack cache (no guard pages) */
static void *
thread_alloc_stack(void)
{
    return kmem_cache_alloc(&thread_stack_cache);
}
/* Release a stack allocated with thread_alloc_stack() */
static void
thread_free_stack(void *stack)
{
    kmem_cache_free(&thread_stack_cache, stack);
}
#endif /* CONFIG_THREAD_STACK_GUARD */
/*
 * Release all resources of a thread.
 *
 * Must not be called by the thread itself, and only once the thread has
 * been fully switched out (THREAD_DEAD), see thread_join_common().
 */
static void
thread_destroy(struct thread *thread)
{
    assert(thread != thread_self());
    assert(thread->state == THREAD_DEAD);

    /* See task_info() */
    task_remove_thread(thread->task, thread);

#ifdef CONFIG_PERFMON
    perfmon_thread_destroy(thread);
#endif

    thread_destroy_tsd(thread);
    turnstile_destroy(thread->priv_turnstile);
    sleepq_destroy(thread->priv_sleepq);
    thread_free_stack(thread->stack);
    tcb_cleanup(&thread->tcb);
    kmem_cache_free(&thread_cache, thread);
}
/*
 * Wait for a thread to terminate, then destroy it.
 *
 * Sleeps until the target announces termination via thread_terminate(),
 * then actively polls its state until it's fully switched out
 * (THREAD_DEAD) before releasing its resources.
 */
static void
thread_join_common(struct thread *thread)
{
    struct thread_runq *runq;
    unsigned long flags, state;
    struct thread *self;

    self = thread_self();
    assert(thread != self);

    spinlock_lock(&thread->join_lock);

    /* There may be only one joiner */
    assert(!thread->join_waiter);
    thread->join_waiter = self;

    while (!thread->terminating) {
        thread_sleep(&thread->join_lock, thread, "exit");
    }

    spinlock_unlock(&thread->join_lock);

    /*
     * The thread is terminating but may still be running on its processor :
     * wait until it has actually been switched out.
     */
    do {
        runq = thread_lock_runq(thread, &flags);
        state = thread->state;
        thread_unlock_runq(runq, flags);
    } while (state != THREAD_DEAD);

    thread_destroy(thread);
}
void
thread_terminate(struct thread *thread)
{
    /*
     * Announce termination to the joiner. The waiter may be NULL, which
     * thread_wakeup() handles gracefully.
     */
    spinlock_lock(&thread->join_lock);
    thread->terminating = true;
    thread_wakeup(thread->join_waiter);
    spinlock_unlock(&thread->join_lock);
}
static void
thread_balance_idle_tick(struct thread_runq *runq)
{
    assert(runq->idle_balance_ticks != 0);

    /*
     * Interrupts can occur early, at a time the balancer thread hasn't been
     * created yet.
     */
    if (runq->balancer == NULL) {
        return;
    }

    /* Wake the balancer once the idle countdown expires */
    if (--runq->idle_balance_ticks == 0) {
        thread_runq_wakeup_balancer(runq);
    }
}
/*
 * Balancer thread function, one instance per processor.
 *
 * Sleeps on its run queue and, each time it's awoken, performs one
 * fair-scheduling balancing pass with the run queue lock held around
 * scheduling points.
 */
static void
thread_balance(void *arg)
{
    struct thread_runq *runq;
    struct thread *self;
    unsigned long flags;

    runq = arg;
    self = runq->balancer;
    assert(self == runq->balancer);

    thread_preempt_disable();
    spinlock_lock_intr_save(&runq->lock, &flags);

    for (;;) {
        /* Rearm the idle countdown, see thread_balance_idle_tick() */
        runq->idle_balance_ticks = THREAD_IDLE_BALANCE_TICKS;
        thread_set_wchan(self, runq, "runq");
        self->state = THREAD_SLEEPING;
        runq = thread_runq_schedule(runq);

        /* The balancer is pinned, the run queue can't change */
        assert(runq == arg);

        /*
         * This function may temporarily enable preemption and release the
         * run queue lock, but on return, the lock must remain held until this
         * balancer thread sleeps.
         */
        thread_sched_fs_balance(runq, &flags);
    }
}
/*
 * Create the balancer thread of a run queue, pinned to that queue's
 * processor and scheduled at the lowest real-time FIFO priority.
 */
static void __init
thread_setup_balancer(struct thread_runq *runq)
{
    char name[THREAD_NAME_SIZE];
    struct thread_attr attr;
    struct thread *balancer;
    struct cpumap *cpumap;
    int error;

    error = cpumap_create(&cpumap);

    if (error) {
        panic("thread: unable to create balancer thread CPU map");
    }

    /* Pin the balancer to its processor */
    cpumap_zero(cpumap);
    cpumap_set(cpumap, thread_runq_cpu(runq));

    snprintf(name, sizeof(name), THREAD_KERNEL_PREFIX "thread_balance/%u",
             thread_runq_cpu(runq));
    thread_attr_init(&attr, name);
    thread_attr_set_cpumap(&attr, cpumap);
    thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO);
    thread_attr_set_priority(&attr, THREAD_SCHED_RT_PRIO_MIN);
    error = thread_create(&balancer, &attr, thread_balance, runq);
    cpumap_destroy(cpumap);

    if (error) {
        panic("thread: unable to create balancer thread");
    }

    runq->balancer = balancer;
}
/*
 * Idler thread function.
 *
 * Interrupts are disabled before checking the yield flag so a wakeup
 * can't slip in between the check and halting the processor.
 */
static void
thread_idle(void *arg)
{
    struct thread *self;

    (void)arg;

    self = thread_self();

    for (;;) {
        thread_preempt_disable();

        for (;;) {
            cpu_intr_disable();

            if (thread_test_flag(self, THREAD_YIELD)) {
                cpu_intr_enable();
                break;
            }

            /*
             * NOTE(review): cpu_idle() presumably reenables interrupts
             * while halting, otherwise this loop could never be woken
             * up - confirm against the machine layer.
             */
            cpu_idle();
        }

        thread_preempt_enable();
    }
}
/*
 * Create the idler thread of a run queue.
 *
 * The idler isn't created with thread_create() since it must not go
 * through a regular wakeup; it's bound to its run queue by hand instead.
 */
static void __init
thread_setup_idler(struct thread_runq *runq)
{
    char name[THREAD_NAME_SIZE];
    struct thread_attr attr;
    struct thread *idler;
    struct cpumap *cpumap;
    void *stack;
    int error;

    error = cpumap_create(&cpumap);

    if (error) {
        panic("thread: unable to allocate idler thread CPU map");
    }

    /* Pin the idler to its processor */
    cpumap_zero(cpumap);
    cpumap_set(cpumap, thread_runq_cpu(runq));
    idler = kmem_cache_alloc(&thread_cache);

    if (idler == NULL) {
        panic("thread: unable to allocate idler thread");
    }

    stack = thread_alloc_stack();

    if (stack == NULL) {
        panic("thread: unable to allocate idler thread stack");
    }

    snprintf(name, sizeof(name), THREAD_KERNEL_PREFIX "thread_idle/%u",
             thread_runq_cpu(runq));
    thread_attr_init(&attr, name);
    thread_attr_set_cpumap(&attr, cpumap);
    thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_IDLE);
    error = thread_init(idler, stack, &attr, thread_idle, NULL);

    if (error) {
        panic("thread: unable to initialize idler thread");
    }

    cpumap_destroy(cpumap);

    /* An idler thread needs special tuning */
    thread_clear_wchan(idler);
    idler->state = THREAD_RUNNING;
    idler->runq = runq;
    runq->idler = idler;
}
/* Create the service threads (balancer, idler) of a run queue */
static void __init
thread_setup_runq(struct thread_runq *runq)
{
    thread_setup_balancer(runq);
    thread_setup_idler(runq);
}
#ifdef CONFIG_SHELL
/*
 * This function is meant for debugging only. As a result, it uses a weak
 * locking policy which allows tracing threads which state may mutate during
 * tracing.
 */
static void
thread_shell_trace(int argc, char *argv[])
{
    const char *task_name, *thread_name;
    struct thread_runq *runq;
    struct thread *thread;
    unsigned long flags;
    struct task *task;
    int error;

    /* Expects exactly a task name and a thread name as arguments */
    if (argc != 3) {
        error = EINVAL;
        goto error;
    }

    task_name = argv[1];
    thread_name = argv[2];

    task = task_lookup(task_name);

    if (task == NULL) {
        error = ESRCH;
        goto error;
    }

    /* The task reference is only needed for the thread lookup */
    thread = task_lookup_thread(task, thread_name);
    task_unref(task);

    if (thread == NULL) {
        error = ESRCH;
        goto error;
    }

    /* A running thread's stack can't be reliably traced */
    runq = thread_lock_runq(thread, &flags);

    if (thread == runq->current) {
        printf("thread: trace: thread is running\n");
    } else {
        tcb_trace(&thread->tcb);
    }

    thread_unlock_runq(runq, flags);

    thread_unref(thread);

    return;

error:
    printf("thread: trace: %s\n", strerror(error));
}
/* Shell commands exported by this module, see thread_setup_shell() */
static struct shell_cmd thread_shell_cmds[] = {
    SHELL_CMD_INITIALIZER("thread_trace", thread_shell_trace,
                          "thread_trace ",
                          "display the stack trace of a given thread"),
};
/* Register the debugging shell commands of this module */
static int __init
thread_setup_shell(void)
{
    SHELL_REGISTER_CMDS(thread_shell_cmds);
    return 0;
}

INIT_OP_DEFINE(thread_setup_shell,
               INIT_OP_DEP(printf_setup, true),
               INIT_OP_DEP(shell_setup, true),
               INIT_OP_DEP(task_setup, true),
               INIT_OP_DEP(thread_setup, true));
#endif /* CONFIG_SHELL */
/*
 * Initialize the booter thread and run queue of a secondary processor.
 * The BSP (cpu 0) is handled at bootstrap, see thread_bootstrap().
 */
static void __init
thread_setup_common(unsigned int cpu)
{
    assert(cpu != 0);
    cpumap_set(&thread_active_runqs, cpu);
    thread_init_booter(cpu);
    thread_runq_init(percpu_ptr(thread_runq, cpu), cpu, &thread_booters[cpu]);
}
/*
 * Late scheduler initialization : set up the run queues of the secondary
 * processors, the thread (and optionally stack) caches, and the service
 * threads of every active run queue.
 */
static int __init
thread_setup(void)
{
    int cpu;

    /* cpu 0 was set up during bootstrap */
    for (cpu = 1; (unsigned int)cpu < cpu_count(); cpu++) {
        thread_setup_common(cpu);
    }

    kmem_cache_init(&thread_cache, "thread", sizeof(struct thread),
                    CPU_L1_SIZE, NULL, 0);

#ifndef CONFIG_THREAD_STACK_GUARD
    kmem_cache_init(&thread_stack_cache, "thread_stack", TCB_STACK_SIZE,
                    CPU_DATA_ALIGN, NULL, 0);
#endif /* CONFIG_THREAD_STACK_GUARD */

    cpumap_for_each(&thread_active_runqs, cpu) {
        thread_setup_runq(percpu_ptr(thread_runq, cpu));
    }

    return 0;
}
/*
 * Optional init operation dependencies of thread_setup(), empty when
 * the corresponding feature is disabled.
 */
#ifdef CONFIG_THREAD_STACK_GUARD
#define THREAD_STACK_GUARD_INIT_OP_DEPS \
    INIT_OP_DEP(vm_kmem_setup, true), \
    INIT_OP_DEP(vm_map_setup, true), \
    INIT_OP_DEP(vm_page_setup, true),
#else /* CONFIG_THREAD_STACK_GUARD */
#define THREAD_STACK_GUARD_INIT_OP_DEPS
#endif /* CONFIG_THREAD_STACK_GUARD */

#ifdef CONFIG_PERFMON
#define THREAD_PERFMON_INIT_OP_DEPS \
    INIT_OP_DEP(perfmon_bootstrap, true),
#else /* CONFIG_PERFMON */
#define THREAD_PERFMON_INIT_OP_DEPS
#endif /* CONFIG_PERFMON */

INIT_OP_DEFINE(thread_setup,
               INIT_OP_DEP(cpumap_setup, true),
               INIT_OP_DEP(kmem_setup, true),
               INIT_OP_DEP(pmap_setup, true),
               INIT_OP_DEP(sleepq_setup, true),
               INIT_OP_DEP(task_setup, true),
               INIT_OP_DEP(thread_bootstrap, true),
               INIT_OP_DEP(turnstile_setup, true),
               THREAD_PERFMON_INIT_OP_DEPS
               THREAD_STACK_GUARD_INIT_OP_DEPS
);
/*
 * Make an application processor run on its booter thread, mirroring what
 * thread_setup_booter() does for the BSP.
 */
void __init
thread_ap_setup(void)
{
    tcb_set_current(&thread_booters[cpu_id()].tcb);
}
/*
 * Allocate, initialize and start a new thread.
 *
 * On success, the new thread is stored in *threadp before being awoken,
 * so the pointer is valid even if the thread runs (or exits) immediately.
 * Returns 0, ENOMEM, or a CPU map validation error.
 */
int
thread_create(struct thread **threadp, const struct thread_attr *attr,
              void (*fn)(void *), void *arg)
{
    struct thread *thread;
    void *stack;
    int error;

    if (attr->cpumap != NULL) {
        error = cpumap_check(attr->cpumap);

        if (error) {
            return error;
        }
    }

    thread = kmem_cache_alloc(&thread_cache);

    if (thread == NULL) {
        error = ENOMEM;
        goto error_thread;
    }

    stack = thread_alloc_stack();

    if (stack == NULL) {
        error = ENOMEM;
        goto error_stack;
    }

    error = thread_init(thread, stack, attr, fn, arg);

    if (error) {
        goto error_init;
    }

    /*
     * The new thread address must be written before the thread is started
     * in case it's passed to it.
     */
    *threadp = thread;

    thread_wakeup(thread);

    return 0;

error_init:
    thread_free_stack(stack);
error_stack:
    kmem_cache_free(&thread_cache, thread);
error_thread:
    return error;
}
/* Deferred reaping of a detached thread, see thread_exit() */
static void
thread_reap(struct work *work)
{
    struct thread_zombie *zombie;

    zombie = structof(work, struct thread_zombie, work);
    thread_join_common(zombie->thread);
}
/*
 * Terminate the calling thread; never returns.
 *
 * Detached threads schedule their own reaping through the work queue;
 * joinable threads are destroyed by their joiner, see thread_join().
 */
void
thread_exit(void)
{
    struct thread_zombie zombie;
    struct thread_runq *runq;
    struct thread *thread;
    unsigned long flags;

    thread = thread_self();

    if (thread_test_flag(thread, THREAD_DETACHED)) {
        zombie.thread = thread;

        work_init(&zombie.work, thread_reap);
        work_schedule(&zombie.work, 0);
    }

    /*
     * Disable preemption before dropping the reference, as this may
     * trigger the active state poll of the join operation. Doing so
     * keeps the duration of that active wait minimum.
     */
    thread_preempt_disable();
    thread_unref(thread);

    runq = thread_runq_local();
    spinlock_lock_intr_save(&runq->lock, &flags);

    /* The joiner polls this state, see thread_join_common() */
    thread->state = THREAD_DEAD;

    thread_runq_schedule(runq);
    panic("thread: dead thread walking");
}
/* Wait for a joinable thread to terminate and destroy it */
void
thread_join(struct thread *thread)
{
    assert(!thread_test_flag(thread, THREAD_DETACHED));

    thread_join_common(thread);
}
/*
 * Wake up a thread, making it runnable again.
 *
 * error is reported to the sleeper through its wakeup_error member, e.g.
 * ETIMEDOUT for timed sleeps. Returns EINVAL when the thread is NULL,
 * the calling thread, or already running.
 */
static int
thread_wakeup_common(struct thread *thread, int error)
{
    struct thread_runq *runq;
    unsigned long flags;

    if ((thread == NULL) || (thread == thread_self())) {
        return EINVAL;
    }

    /*
     * There is at most one reference on threads that were never dispatched,
     * in which case there is no need to lock anything.
     */
    if (thread->runq == NULL) {
        assert(thread->state != THREAD_RUNNING);
        thread_clear_wchan(thread);
        thread->state = THREAD_RUNNING;
    } else {
        runq = thread_lock_runq(thread, &flags);

        /* A running thread can't be awoken again */
        if (thread->state == THREAD_RUNNING) {
            thread_unlock_runq(runq, flags);
            return EINVAL;
        }

        thread_clear_wchan(thread);
        thread->state = THREAD_RUNNING;
        thread_unlock_runq(runq, flags);
    }

    thread_preempt_disable_intr_save(&flags);

    /*
     * Pinned threads must stay on their current run queue. In the other
     * case, the class operation selects a target queue and, matching the
     * unconditional unlock below, returns with it locked.
     */
    if (thread->pin_level == 0) {
        runq = thread_get_real_sched_ops(thread)->select_runq(thread);
    } else {
        /*
         * This access doesn't need to be atomic, as the current thread is
         * the only one which may update the member.
         */
        runq = thread->runq;
        spinlock_lock(&runq->lock);
    }

    thread->wakeup_error = error;
    thread_runq_wakeup(runq, thread);
    spinlock_unlock(&runq->lock);
    thread_preempt_enable_intr_restore(flags);

    return 0;
}
/* Wake up a thread with no wakeup error, see thread_wakeup_common() */
int
thread_wakeup(struct thread *thread)
{
    return thread_wakeup_common(thread, 0);
}
/*
 * Stack-allocated context for timed sleeps, linking the sleeping thread
 * to the timer that wakes it up, see thread_timeout().
 */
struct thread_timeout_waiter {
    struct thread *thread;      /* The sleeping thread to wake up */
    struct timer timer;         /* Timer firing after the sleep timeout */
};
/* Timer callback for timed sleeps : wake the sleeper with ETIMEDOUT */
static void
thread_timeout(struct timer *timer)
{
    struct thread_timeout_waiter *waiter;

    waiter = structof(timer, struct thread_timeout_waiter, timer);
    thread_wakeup_common(waiter->thread, ETIMEDOUT);
}
/*
 * Common sleep implementation.
 *
 * interlock, if not NULL, is released only after the run queue lock is
 * taken, and reacquired before returning, so no wakeup can be missed.
 * When timed, a timer wakes the thread with ETIMEDOUT after ticks.
 * Returns the wakeup error.
 */
static int
thread_sleep_common(struct spinlock *interlock, const void *wchan_addr,
                    const char *wchan_desc, bool timed, uint64_t ticks)
{
    struct thread_timeout_waiter waiter;
    struct thread_runq *runq;
    struct thread *thread;
    unsigned long flags;

    thread = thread_self();

    if (timed) {
        waiter.thread = thread;
        timer_init(&waiter.timer, thread_timeout, TIMER_INTR);
        timer_schedule(&waiter.timer, ticks);
    }

    runq = thread_runq_local();
    spinlock_lock_intr_save(&runq->lock, &flags);

    if (interlock != NULL) {
        /* Keep preemption disabled until the interlock is reacquired */
        thread_preempt_disable();
        spinlock_unlock(interlock);
    }

    thread_set_wchan(thread, wchan_addr, wchan_desc);
    thread->state = THREAD_SLEEPING;

    /* The local run queue may have changed due to migration */
    runq = thread_runq_schedule(runq);
    assert(thread->state == THREAD_RUNNING);
    spinlock_unlock_intr_restore(&runq->lock, flags);

    if (timed) {
        timer_cancel(&waiter.timer);
    }

    if (interlock != NULL) {
        spinlock_lock(interlock);
        thread_preempt_enable_no_resched();
    }

    return thread->wakeup_error;
}
/*
 * Untimed sleep, see thread_sleep_common(). An untimed sleep can only
 * end with a successful wakeup, hence the assertion.
 */
void
thread_sleep(struct spinlock *interlock, const void *wchan_addr,
             const char *wchan_desc)
{
    int error;

    error = thread_sleep_common(interlock, wchan_addr, wchan_desc, false, 0);
    assert(!error);
}
/*
 * Sleep with a timeout, in ticks. Returns the wakeup error, ETIMEDOUT
 * when the timeout expired first.
 */
int
thread_timedsleep(struct spinlock *interlock, const void *wchan_addr,
                  const char *wchan_desc, uint64_t ticks)
{
    return thread_sleep_common(interlock, wchan_addr, wchan_desc, true, ticks);
}
/*
 * Sleep for the given duration in ticks, or, when absolute is true,
 * until the given absolute time.
 */
void
thread_delay(uint64_t ticks, bool absolute)
{
    thread_preempt_disable();

    if (!absolute) {
        /* Add a tick to avoid quantization errors */
        ticks += clock_get_time() + 1;
    }

    thread_timedsleep(NULL, thread_self(), "delay", ticks);
    thread_preempt_enable();
}
/*
 * Boot synchronization barrier : spin until every processor has
 * incremented the boot counter, so all of them enter the scheduler
 * together, see thread_run_scheduler().
 */
static void __init
thread_boot_barrier(void)
{
    unsigned int nr_cpus;

    assert(!cpu_intr_enabled());

    atomic_add(&thread_nr_boot_cpus, 1, ATOMIC_RELAXED);

    for (;;) {
        nr_cpus = atomic_load(&thread_nr_boot_cpus, ATOMIC_SEQ_CST);

        if (nr_cpus == cpu_count()) {
            break;
        }

        cpu_pause();
    }
}
/*
 * Enter the scheduler for the first time on the calling processor.
 *
 * Called once per processor at the end of initialization; control is
 * transferred to the first scheduled thread and never returns here.
 */
void __init
thread_run_scheduler(void)
{
    struct thread_runq *runq;
    struct thread *thread;

    assert(!cpu_intr_enabled());

    /* Wait for all processors, see thread_boot_barrier() */
    thread_boot_barrier();

    runq = thread_runq_local();
    thread = thread_self();
    assert(thread == runq->current);
    assert(thread->preempt_level == (THREAD_SUSPEND_PREEMPT_LEVEL - 1));

    sref_register();

    spinlock_lock(&runq->lock);
    thread = thread_runq_get_next(thread_runq_local());

    /* The dispatched thread inherits ownership of the run queue lock */
    spinlock_transfer_owner(&runq->lock, thread);

    tcb_load(&thread->tcb);
}
/*
 * Yield the processor to other threads.
 *
 * A no-op when preemption is disabled. Loops until the yield flag is
 * cleared, i.e. until the request has actually been honored.
 */
void
thread_yield(void)
{
    struct thread_runq *runq;
    struct thread *thread;
    unsigned long flags;

    thread = thread_self();

    /* Rescheduling is impossible while preemption is disabled */
    if (!thread_preempt_enabled()) {
        return;
    }

    do {
        thread_preempt_disable();
        runq = thread_runq_local();
        spinlock_lock_intr_save(&runq->lock, &flags);
        runq = thread_runq_schedule(runq);
        spinlock_unlock_intr_restore(&runq->lock, flags);
        thread_preempt_enable_no_resched();
    } while (thread_test_flag(thread, THREAD_YIELD));
}
/*
 * Yield only if the current thread was marked for preemption. The fast
 * path takes no lock.
 */
void
thread_schedule(void)
{
    if (likely(!thread_test_flag(thread_self(), THREAD_YIELD))) {
        return;
    }

    thread_yield();
}
/*
 * Handle a scheduling interrupt.
 *
 * NOTE(review): this only accounts for the interrupt; the actual
 * rescheduling is presumably triggered on interrupt return - confirm
 * against the interrupt handling code.
 */
void
thread_schedule_intr(void)
{
    struct thread_runq *runq;

    assert(thread_check_intr_context());

    runq = thread_runq_local();
    syscnt_inc(&runq->sc_schedule_intrs);
}
/*
 * Periodic (tick) event handler.
 *
 * Must be called from interrupt context. Drives the per-class tick
 * operation of the running thread and, on idle processors, the idle
 * balancing countdown.
 */
void
thread_report_periodic_event(void)
{
    const struct thread_sched_ops *ops;
    struct thread_runq *runq;
    struct thread *thread;

    assert(thread_check_intr_context());

    runq = thread_runq_local();
    thread = thread_self();

    spinlock_lock(&runq->lock);

    /* An empty run queue means the idler thread is running */
    if (runq->nr_threads == 0) {
        thread_balance_idle_tick(runq);
    }

    ops = thread_get_real_sched_ops(thread);

    if (ops->tick != NULL) {
        ops->tick(runq, thread);
    }

    spinlock_unlock(&runq->lock);
}
/*
 * Return the processor of the run queue the thread is attached to.
 * The thread must have been dispatched at least once.
 */
unsigned int
thread_cpu(const struct thread *thread)
{
    assert(thread->runq);
    return thread->runq->cpu;
}
/* Return a one-letter code describing the thread state */
char
thread_state_to_chr(const struct thread *thread)
{
    switch (thread->state) {
    case THREAD_RUNNING:
        return 'R';
    case THREAD_SLEEPING:
        return 'S';
    case THREAD_DEAD:
        return 'Z';
    default:
        panic("thread: unknown state");
    }
}
/* Return a short human-readable name for a scheduling class */
const char *
thread_sched_class_to_str(unsigned char sched_class)
{
    switch (sched_class) {
    case THREAD_SCHED_CLASS_RT:
        return "rt";
    case THREAD_SCHED_CLASS_FS:
        return "fs";
    case THREAD_SCHED_CLASS_IDLE:
        return "idle";
    default:
        panic("thread: unknown scheduling class");
    }
}
/*
 * Set the user scheduling policy and priority of a thread.
 *
 * If the thread is enqueued, it's removed, updated, then requeued so the
 * run queue state stays consistent. A boosted thread only has its real
 * parameters reset when the new user priority overcomes the boost.
 */
void
thread_setscheduler(struct thread *thread, unsigned char policy,
                    unsigned short priority)
{
    struct thread_runq *runq;
    struct turnstile_td *td;
    unsigned long flags;
    bool requeue, current, update;

    td = thread_turnstile_td(thread);
    turnstile_td_lock(td);
    runq = thread_lock_runq(thread, &flags);

    /* Nothing to do if the parameters don't actually change */
    if ((thread_user_sched_policy(thread) == policy)
        && (thread_user_priority(thread) == priority)) {
        goto out;
    }

    requeue = thread->in_runq;

    if (!requeue) {
        current = false;
    } else {
        if (thread != runq->current) {
            current = false;
        } else {
            /* The running thread must be properly put before removal */
            thread_runq_put_prev(runq, thread);
            current = true;
        }

        thread_runq_remove(runq, thread);
    }

    /* Same policy : in-place priority update; new policy : full reset */
    if (thread_user_sched_policy(thread) == policy) {
        thread_update_user_priority(thread, priority);
        update = true;
    } else {
        thread_set_user_sched_policy(thread, policy);
        thread_set_user_sched_class(thread, thread_policy_to_class(policy));
        thread_set_user_priority(thread, priority);
        update = false;
    }

    if (thread->boosted) {
        /* Only drop the boost if the new user priority overcomes it */
        if (thread_user_global_priority(thread)
            >= thread_real_global_priority(thread)) {
            thread_reset_real_priority(thread);
        }
    } else {
        if (update) {
            thread_update_real_priority(thread, priority);
        } else {
            thread_set_real_sched_policy(thread, policy);
            thread_set_real_sched_class(thread, thread_policy_to_class(policy));
            thread_set_real_priority(thread, priority);
        }
    }

    if (requeue) {
        thread_runq_add(runq, thread);

        if (current) {
            thread_runq_set_next(runq, thread);
        }
    }

out:
    thread_unlock_runq(runq, flags);
    turnstile_td_unlock(td);

    /* Priority propagation is done without the run queue lock held */
    turnstile_td_propagate_priority(td);
}
/*
 * Set the real (priority inheritance) scheduling parameters of a thread.
 *
 * Called during priority propagation with the turnstile data locked.
 * A boost that doesn't exceed the user global priority resets the thread
 * to its user parameters instead.
 */
void
thread_pi_setscheduler(struct thread *thread, unsigned char policy,
                       unsigned short priority)
{
    const struct thread_sched_ops *ops;
    struct thread_runq *runq;
    struct turnstile_td *td;
    unsigned int global_priority;
    unsigned long flags;
    bool requeue, current;

    td = thread_turnstile_td(thread);
    turnstile_td_assert_lock(td);

    ops = thread_get_sched_ops(thread_policy_to_class(policy));
    global_priority = ops->get_global_priority(priority);

    runq = thread_lock_runq(thread, &flags);

    /* Nothing to do if the real parameters don't actually change */
    if ((thread_real_sched_policy(thread) == policy)
        && (thread_real_priority(thread) == priority)) {
        goto out;
    }

    syscnt_inc(&runq->sc_boosts);

    requeue = thread->in_runq;

    if (!requeue) {
        current = false;
    } else {
        if (thread != runq->current) {
            current = false;
        } else {
            /* The running thread must be properly put before removal */
            thread_runq_put_prev(runq, thread);
            current = true;
        }

        thread_runq_remove(runq, thread);
    }

    /* A boost at or below the user priority is effectively a deboost */
    if (global_priority <= thread_user_global_priority(thread)) {
        thread_reset_real_priority(thread);
    } else {
        if (thread_real_sched_policy(thread) == policy) {
            thread_update_real_priority(thread, priority);
        } else {
            thread_set_real_sched_policy(thread, policy);
            thread_set_real_sched_class(thread, thread_policy_to_class(policy));
            thread_set_real_priority(thread, priority);
        }

        thread->boosted = true;
    }

    if (requeue) {
        thread_runq_add(runq, thread);

        if (current) {
            thread_runq_set_next(runq, thread);
        }
    }

out:
    thread_unlock_runq(runq, flags);
}
/*
 * Propagate the calling thread's priority through its turnstile data.
 *
 * If preemption is disabled, the work is deferred : propagation can be
 * too expensive to run in that context, so only a flag is set and the
 * operation is left for a later, preemptible point.
 */
void
thread_propagate_priority(void)
{
    if (!thread_preempt_enabled()) {
        thread_set_priority_propagation_needed();
        return;
    }

    struct thread *self = thread_self();

    /*
     * The flag is cleared before propagating, not after, so that
     * propagation triggered from within can't recurse forever.
     */
    self->propagate_priority = false;
    turnstile_td_propagate_priority(thread_turnstile_td(self));
}
/*
 * Allocate a new thread-specific data key and register its destructor.
 *
 * Keys are handed out from a monotonically increasing atomic counter,
 * so concurrent callers each obtain a distinct slot. Panics when the
 * key space is exhausted.
 */
void
thread_key_create(unsigned int *keyp, thread_dtor_fn_t dtor)
{
    unsigned int index;

    /* Atomically reserve the next free slot */
    index = atomic_fetch_add(&thread_nr_keys, 1, ATOMIC_RELAXED);

    if (index >= THREAD_KEYS_MAX) {
        panic("thread: maximum number of keys exceeded");
    }

    thread_dtors[index] = dtor;
    *keyp = index;
}
bool
thread_is_running(const struct thread *thread)
{
const struct thread_runq *runq;
runq = atomic_load(&thread->runq, ATOMIC_RELAXED);
return (runq != NULL)
&& (atomic_load(&runq->current, ATOMIC_RELAXED) == thread);
}