/*
 * Copyright (c) 2012-2018 Richard Braun.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 *
 * The scheduling algorithm implemented by this module, named Distributed
 * Group Ratio Round-Robin (DGR3), is based on the following papers :
 * - "Group Ratio Round-Robin: O(1) Proportional Share Scheduling for
 *   Uniprocessor and Multiprocessor Systems" by Bogdan Caprita, Wong Chun
 *   Chan, Jason Nieh, Clifford Stein and Haoqiang Zheng.
 * - "Efficient and Scalable Multiprocessor Fair Scheduling Using Distributed
 *   Weighted Round-Robin" by Tong Li, Dan Baumberger and Scott Hahn.
 *
 * Note that the Group Ratio Round-Robin (GR3) paper offers a multiprocessor
 * extension, but based on a single global queue, which strongly limits its
 * scalability on systems with many processors. That extension isn't used in
 * this implementation.
 *
 * The basic idea is to use GR3 for processor-local scheduling, and Distributed
 * Weighted Round-Robin (DWRR) for inter-processor load balancing. These
 * algorithms were chosen for several reasons. To begin with, they provide
 * fair scheduling, a very desirable property for a modern scheduler. Next,
 * being based on round-robin, their algorithmic complexity is very low (GR3
 * has O(1) scheduling complexity, and O(g) complexity on thread addition
 * or removal, g being the number of groups, with one group per priority, a
 * low number in practice). Finally, they're very simple to implement, making
 * them easy to adjust and maintain.
 *
 * Both algorithms are actually slightly modified for efficiency. First, this
 * version of GR3 is simplified by mapping exactly one group to one priority,
 * and in turn, one weight. This is possible because priorities are intended
 * to match Unix nice values, and systems commonly provide a constant, small
 * set of nice values. This removes the need for deficit accounting. Next,
 * round tracking is used to improve the handling of dynamic events : work
 * scaling is performed only on thread addition, and not when a thread that
 * was removed is added again during the same round. In addition, since GR3
 * is itself a round-robin algorithm, it already provides the feature required
 * from local scheduling by DWRR, namely round slicing. Consequently, DWRR
 * doesn't sit "on top" of GR3, but is actually merged with it. The result is
 * an algorithm that shares the same data for both local scheduling and load
 * balancing.
 *
 * A few terms are used by both papers with slightly different meanings. Here
 * are the definitions used in this implementation :
 * - The time unit is the system timer period (1 / tick frequency)
 * - Work is the amount of execution time units consumed
 * - Weight is the amount of execution time units allocated
 * - A round is the shortest period during which all threads in a run queue
 *   consume their allocated time (i.e. their work reaches their weight)
 *
 * TODO Sub-tick accounting.
 *
 * TODO Setting affinity after thread creation.
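 *
 * To make the definitions above concrete, here is a rough illustration
 * based on the weight computation used below (thread_sched_fs_prio2weight),
 * assuming a 1000 Hz tick frequency, which is configuration-dependent :
 * the round slice base is then CLOCK_FREQ / 10 = 100 time units, so a
 * thread at priority 0 has a weight of 100 and a thread at priority 3 a
 * weight of 400. A round during which both stay runnable lasts 500 time
 * units, of which the priority 3 thread receives four times as many as
 * the priority 0 thread.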
 *
 * TODO Take into account the underlying CPU topology (and adjust load
 * balancing to access the global highest round less frequently on large
 * processor groups, perhaps by applying the load balancing algorithm in a
 * bottom-up fashion with one highest round per processor group).
 *
 * TODO For now, interactivity cannot be experimented with. The current
 * strategy is to always add threads in front of their group queue and track
 * rounds so that they don't get more time than they should. A direct
 * consequence is that continually spawning threads at short intervals is
 * likely to cause starvation. This could be fixed by adding newly created
 * threads at the back of their group queue. For now, don't overengineer,
 * and wait until all this can actually be tested.
 *
 * TODO Review weight computation (it may be more appropriate to determine
 * weights in a smoother way than a raw scaling).
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Preemption level of a suspended thread.
 *
 * The expected interrupt, preemption and run queue lock state when
 * dispatching a thread is :
 * - interrupts disabled
 * - preemption disabled
 * - run queue locked
 *
 * Locking the run queue increases the preemption level once more,
 * making its value 2.
 */
#define THREAD_SUSPEND_PREEMPT_LEVEL 2

/*
 * Scheduling classes.
 *
 * Classes are sorted by order of priority (lower indexes first). The same
 * class can apply to several policies.
 *
 * The idle class is reserved for the per-CPU idle threads.
 */
#define THREAD_SCHED_CLASS_RT       0
#define THREAD_SCHED_CLASS_FS       1
#define THREAD_SCHED_CLASS_IDLE     2
#define THREAD_NR_SCHED_CLASSES     3

/*
 * Global priority bases for each scheduling class.
 *
 * Global priorities are only used to determine which of two threads
 * has the higher priority, and should only matter for priority
 * inheritance.
 *
 * In the current configuration, all fair-scheduling threads have the
 * same global priority.
 */
#define THREAD_SCHED_GLOBAL_PRIO_RT     2
#define THREAD_SCHED_GLOBAL_PRIO_FS     1
#define THREAD_SCHED_GLOBAL_PRIO_IDLE   0

/*
 * Default time slice for real-time round-robin scheduling.
 */
#define THREAD_DEFAULT_RR_TIME_SLICE (CLOCK_FREQ / 10)

/*
 * Maximum number of threads which can be pulled from a remote run queue
 * while interrupts are disabled.
 */
#define THREAD_MAX_MIGRATIONS 16

/*
 * Delay (in ticks) between two balance attempts when a run queue is idle.
 */
#define THREAD_IDLE_BALANCE_TICKS (CLOCK_FREQ / 2)

/*
 * Run queue properties for real-time threads.
 */
struct thread_rt_runq {
    unsigned int bitmap;
    struct list threads[THREAD_SCHED_RT_PRIO_MAX + 1];
};

/*
 * Initial value of the highest round.
 *
 * Set to a high value to make sure overflows are correctly handled.
 */
#define THREAD_FS_INITIAL_ROUND ((unsigned long)-10)

/*
 * Round slice base unit for fair-scheduling threads.
 */
#define THREAD_FS_ROUND_SLICE_BASE (CLOCK_FREQ / 10)

/*
 * Group of threads sharing the same weight.
 */
struct thread_fs_group {
    struct list node;
    struct list threads;
    unsigned int weight;
    unsigned int work;
};

/*
 * Run queue properties for fair-scheduling threads.
 *
 * The current group pointer has a valid address only when the run queue isn't
 * empty.
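 *
 * The groups list is sorted by decreasing weight. The current pointer
 * implements the GR3 intergroup round-robin : it moves from the current
 * group to the next, lighter one once the current group has consumed its
 * proportional share of work with respect to that next group (see
 * thread_sched_fs_ratio_exceeded), and otherwise starts over from the
 * heaviest group.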
*/ struct thread_fs_runq { struct thread_fs_group group_array[THREAD_SCHED_FS_PRIO_MAX + 1]; struct list groups; struct list threads; struct thread_fs_group *current; unsigned int nr_threads; unsigned int weight; unsigned int work; }; /* * Per processor run queue. * * Locking multiple run queues is done in the ascending order of their CPU * identifier. Interrupts must be disabled whenever locking a run queue, even * a remote one, otherwise an interrupt (which invokes the scheduler on its * return path) may violate the locking order. */ struct thread_runq { alignas(CPU_L1_SIZE) struct spinlock lock; unsigned int cpu; unsigned int nr_threads; struct thread *current; /* Real-time related members */ struct thread_rt_runq rt_runq; /* * Fair-scheduling related members. * * The current round is set when the active run queue becomes non-empty. * It's not reset when both run queues become empty. As a result, the * current round has a meaningful value only when at least one thread is * present, i.e. the global weight isn't zero. */ unsigned long fs_round; unsigned int fs_weight; struct thread_fs_runq fs_runqs[2]; struct thread_fs_runq *fs_runq_active; struct thread_fs_runq *fs_runq_expired; struct thread *balancer; struct thread *idler; /* Ticks before the next balancing attempt when a run queue is idle */ unsigned int idle_balance_ticks; struct syscnt sc_schedule_intrs; struct syscnt sc_boosts; }; /* * Operations of a scheduling class. */ struct thread_sched_ops { struct thread_runq * (*select_runq)(struct thread *thread); void (*add)(struct thread_runq *runq, struct thread *thread); void (*remove)(struct thread_runq *runq, struct thread *thread); void (*put_prev)(struct thread_runq *runq, struct thread *thread); struct thread * (*get_next)(struct thread_runq *runq); void (*reset_priority)(struct thread *thread, unsigned short priority); void (*update_priority)(struct thread *thread, unsigned short priority); unsigned int (*get_global_priority)(unsigned short priority); void (*set_next)(struct thread_runq *runq, struct thread *thread); void (*tick)(struct thread_runq *runq, struct thread *thread); }; static struct thread_runq thread_runq __percpu; /* * Statically allocated fake threads that provide thread context to processors * during bootstrap. */ static struct thread thread_booters[CONFIG_MAX_CPUS] __initdata; static struct kmem_cache thread_cache; #ifndef CONFIG_THREAD_STACK_GUARD static struct kmem_cache thread_stack_cache; #endif /* CONFIG_THREAD_STACK_GUARD */ static const unsigned char thread_policy_table[THREAD_NR_SCHED_POLICIES] = { [THREAD_SCHED_POLICY_FIFO] = THREAD_SCHED_CLASS_RT, [THREAD_SCHED_POLICY_RR] = THREAD_SCHED_CLASS_RT, [THREAD_SCHED_POLICY_FS] = THREAD_SCHED_CLASS_FS, [THREAD_SCHED_POLICY_IDLE] = THREAD_SCHED_CLASS_IDLE, }; static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES]; /* * Map of run queues for which a processor is running. */ static struct cpumap thread_active_runqs; /* * Map of idle run queues. * * Access to this map isn't synchronized. It is merely used as a fast hint * to find run queues that are likely to be idle. */ static struct cpumap thread_idle_runqs; /* * System-wide value of the current highest round. * * This global variable is accessed without any synchronization. Its value * being slightly inaccurate doesn't harm the fairness properties of the * scheduling and load balancing algorithms. * * There can be moderate bouncing on this word so give it its own cache line. 
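 *
 * Rounds are compared through the signed difference of their unsigned
 * values, e.g. (long)(runq->fs_round - thread_fs_highest_round), so
 * wrap-around is harmless as long as the compared rounds are less than
 * half the type range apart. For example, if the highest round is
 * ULONG_MAX and a run queue has wrapped to round 1, the difference is 2,
 * correctly reporting the run queue as being ahead. The initial round
 * (THREAD_FS_INITIAL_ROUND) is deliberately set close to the overflow
 * point so that this path is exercised early.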
*/ static struct { alignas(CPU_L1_SIZE) volatile unsigned long value; } thread_fs_highest_round_struct; #define thread_fs_highest_round (thread_fs_highest_round_struct.value) /* * Number of TSD keys actually allocated. */ static unsigned int thread_nr_keys __read_mostly; /* * Destructors installed for each key. */ static thread_dtor_fn_t thread_dtors[THREAD_KEYS_MAX] __read_mostly; /* * Number of processors which have requested the scheduler to run. * * This value is used to implement a global barrier across the entire * system at boot time, so that inter-processor requests may not be * lost in case a processor is slower to initialize. */ static unsigned int thread_nr_boot_cpus __initdata; struct thread_zombie { struct work work; struct thread *thread; }; static unsigned char thread_policy_to_class(unsigned char policy) { assert(policy < ARRAY_SIZE(thread_policy_table)); return thread_policy_table[policy]; } static void thread_set_wchan(struct thread *thread, const void *wchan_addr, const char *wchan_desc) { assert((wchan_addr != NULL) && (wchan_desc != NULL)); thread->wchan_addr = wchan_addr; thread->wchan_desc = wchan_desc; } static void thread_clear_wchan(struct thread *thread) { thread->wchan_addr = NULL; thread->wchan_desc = NULL; } static const struct thread_sched_ops * thread_get_sched_ops(unsigned char sched_class) { assert(sched_class < ARRAY_SIZE(thread_sched_ops)); return &thread_sched_ops[sched_class]; } static const struct thread_sched_ops * thread_get_user_sched_ops(const struct thread *thread) { return thread_get_sched_ops(thread_user_sched_class(thread)); } static const struct thread_sched_ops * thread_get_real_sched_ops(const struct thread *thread) { return thread_get_sched_ops(thread_real_sched_class(thread)); } static void __init thread_runq_init_rt(struct thread_runq *runq) { struct thread_rt_runq *rt_runq; size_t i; rt_runq = &runq->rt_runq; rt_runq->bitmap = 0; for (i = 0; i < ARRAY_SIZE(rt_runq->threads); i++) { list_init(&rt_runq->threads[i]); } } static void __init thread_fs_group_init(struct thread_fs_group *group) { list_init(&group->threads); group->weight = 0; group->work = 0; } static void __init thread_fs_runq_init(struct thread_fs_runq *fs_runq) { size_t i; for (i = 0; i < ARRAY_SIZE(fs_runq->group_array); i++) { thread_fs_group_init(&fs_runq->group_array[i]); } list_init(&fs_runq->groups); list_init(&fs_runq->threads); fs_runq->nr_threads = 0; fs_runq->weight = 0; fs_runq->work = 0; } static void __init thread_runq_init_fs(struct thread_runq *runq) { runq->fs_weight = 0; runq->fs_runq_active = &runq->fs_runqs[0]; runq->fs_runq_expired = &runq->fs_runqs[1]; thread_fs_runq_init(runq->fs_runq_active); thread_fs_runq_init(runq->fs_runq_expired); } static void __init thread_runq_init(struct thread_runq *runq, unsigned int cpu, struct thread *booter) { char name[SYSCNT_NAME_SIZE]; spinlock_init(&runq->lock); runq->cpu = cpu; runq->nr_threads = 0; runq->current = booter; thread_runq_init_rt(runq); thread_runq_init_fs(runq); runq->balancer = NULL; runq->idler = NULL; runq->idle_balance_ticks = (unsigned int)-1; snprintf(name, sizeof(name), "thread_schedule_intrs/%u", cpu); syscnt_register(&runq->sc_schedule_intrs, name); snprintf(name, sizeof(name), "thread_boosts/%u", cpu); syscnt_register(&runq->sc_boosts, name); } static inline struct thread_runq * thread_runq_local(void) { assert(!thread_preempt_enabled() || thread_pinned()); return cpu_local_ptr(thread_runq); } static inline unsigned int thread_runq_cpu(struct thread_runq *runq) { return runq->cpu; } static 
void thread_runq_add(struct thread_runq *runq, struct thread *thread) { const struct thread_sched_ops *ops; assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); assert(!thread->in_runq); ops = thread_get_real_sched_ops(thread); ops->add(runq, thread); if (runq->nr_threads == 0) { cpumap_clear_atomic(&thread_idle_runqs, thread_runq_cpu(runq)); } runq->nr_threads++; if (thread_real_sched_class(thread) < thread_real_sched_class(runq->current)) { thread_set_flag(runq->current, THREAD_YIELD); } atomic_store(&thread->runq, runq, ATOMIC_RELAXED); thread->in_runq = true; } static void thread_runq_remove(struct thread_runq *runq, struct thread *thread) { const struct thread_sched_ops *ops; assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); assert(thread->in_runq); runq->nr_threads--; if (runq->nr_threads == 0) { cpumap_set_atomic(&thread_idle_runqs, thread_runq_cpu(runq)); } ops = thread_get_real_sched_ops(thread); ops->remove(runq, thread); thread->in_runq = false; } static void thread_runq_put_prev(struct thread_runq *runq, struct thread *thread) { const struct thread_sched_ops *ops; assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); ops = thread_get_real_sched_ops(thread); if (ops->put_prev != NULL) { ops->put_prev(runq, thread); } } static struct thread * thread_runq_get_next(struct thread_runq *runq) { struct thread *thread; unsigned int i; assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); for (i = 0; i < ARRAY_SIZE(thread_sched_ops); i++) { thread = thread_sched_ops[i].get_next(runq); if (thread != NULL) { atomic_store(&runq->current, thread, ATOMIC_RELAXED); return thread; } } /* The idle class should never be empty */ panic("thread: unable to find next thread"); } static void thread_runq_set_next(struct thread_runq *runq, struct thread *thread) { const struct thread_sched_ops *ops; ops = thread_get_real_sched_ops(thread); if (ops->set_next != NULL) { ops->set_next(runq, thread); } atomic_store(&runq->current, thread, ATOMIC_RELAXED); } static void thread_runq_wakeup(struct thread_runq *runq, struct thread *thread) { assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); assert(thread->state == THREAD_RUNNING); thread_runq_add(runq, thread); if ((runq != thread_runq_local()) && thread_test_flag(runq->current, THREAD_YIELD)) { cpu_send_thread_schedule(thread_runq_cpu(runq)); } } static void thread_runq_wakeup_balancer(struct thread_runq *runq) { if (runq->balancer->state == THREAD_RUNNING) { return; } thread_clear_wchan(runq->balancer); runq->balancer->state = THREAD_RUNNING; thread_runq_wakeup(runq, runq->balancer); } static void thread_runq_schedule_load(struct thread *thread) { pmap_load(thread->task->map->pmap); #ifdef CONFIG_PERFMON perfmon_thread_load(thread); #endif } static void thread_runq_schedule_unload(struct thread *thread) { #ifdef CONFIG_PERFMON perfmon_thread_unload(thread); #else (void)thread; #endif } static struct thread_runq * thread_runq_schedule(struct thread_runq *runq) { struct thread *prev, *next; prev = thread_self(); assert((__builtin_frame_address(0) >= prev->stack) && (__builtin_frame_address(0) < (prev->stack + TCB_STACK_SIZE))); assert(prev->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL); assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); thread_clear_flag(prev, THREAD_YIELD); thread_runq_put_prev(runq, prev); if (prev->state != THREAD_RUNNING) { thread_runq_remove(runq, prev); if ((runq->nr_threads == 0) && (prev != runq->balancer)) { thread_runq_wakeup_balancer(runq); } } next 
= thread_runq_get_next(runq); assert((next != runq->idler) || (runq->nr_threads == 0)); assert(next->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL); if (likely(prev != next)) { thread_runq_schedule_unload(prev); rcu_report_context_switch(thread_rcu_reader(prev)); spinlock_transfer_owner(&runq->lock, next); /* * That's where the true context switch occurs. The next thread must * unlock the run queue and reenable preemption. Note that unlocking * and locking the run queue again is equivalent to a full memory * barrier. */ tcb_switch(&prev->tcb, &next->tcb); /* * The thread is dispatched on a processor once again. * * Keep in mind the system state may have changed a lot since this * function was called. In particular : * - The next thread may have been destroyed, and must not be * referenced any more. * - The current thread may have been migrated to another processor. */ barrier(); thread_runq_schedule_load(prev); next = NULL; runq = thread_runq_local(); } else { next = NULL; } assert(prev->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL); assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); return runq; } static void thread_runq_double_lock(struct thread_runq *a, struct thread_runq *b) { assert(!cpu_intr_enabled()); assert(!thread_preempt_enabled()); assert(a != b); if (a->cpu < b->cpu) { spinlock_lock(&a->lock); spinlock_lock(&b->lock); } else { spinlock_lock(&b->lock); spinlock_lock(&a->lock); } } static struct thread_runq * thread_sched_rt_select_runq(struct thread *thread) { struct thread_runq *runq; int i; /* * Real-time tasks are commonly configured to run on one specific * processor only. */ i = cpumap_find_first(&thread->cpumap); assert(i >= 0); assert(cpumap_test(&thread_active_runqs, i)); runq = percpu_ptr(thread_runq, i); spinlock_lock(&runq->lock); return runq; } static void thread_sched_rt_add(struct thread_runq *runq, struct thread *thread) { struct thread_rt_runq *rt_runq; struct list *threads; rt_runq = &runq->rt_runq; threads = &rt_runq->threads[thread_real_priority(thread)]; list_insert_tail(threads, &thread->rt_data.node); if (list_singular(threads)) { rt_runq->bitmap |= (1ULL << thread_real_priority(thread)); } if ((thread_real_sched_class(thread) == thread_real_sched_class(runq->current)) && (thread_real_priority(thread) > thread_real_priority(runq->current))) { thread_set_flag(runq->current, THREAD_YIELD); } } static void thread_sched_rt_remove(struct thread_runq *runq, struct thread *thread) { struct thread_rt_runq *rt_runq; struct list *threads; rt_runq = &runq->rt_runq; threads = &rt_runq->threads[thread_real_priority(thread)]; list_remove(&thread->rt_data.node); if (list_empty(threads)) { rt_runq->bitmap &= ~(1ULL << thread_real_priority(thread)); } } static void thread_sched_rt_put_prev(struct thread_runq *runq, struct thread *thread) { thread_sched_rt_add(runq, thread); } static struct thread * thread_sched_rt_get_next(struct thread_runq *runq) { struct thread_rt_runq *rt_runq; struct thread *thread; struct list *threads; unsigned int priority; rt_runq = &runq->rt_runq; if (rt_runq->bitmap == 0) { return NULL; } priority = THREAD_SCHED_RT_PRIO_MAX - __builtin_clz(rt_runq->bitmap); threads = &rt_runq->threads[priority]; assert(!list_empty(threads)); thread = list_first_entry(threads, struct thread, rt_data.node); thread_sched_rt_remove(runq, thread); return thread; } static void thread_sched_rt_reset_priority(struct thread *thread, unsigned short priority) { assert(priority <= THREAD_SCHED_RT_PRIO_MAX); thread->rt_data.time_slice = 
THREAD_DEFAULT_RR_TIME_SLICE; } static unsigned int thread_sched_rt_get_global_priority(unsigned short priority) { return THREAD_SCHED_GLOBAL_PRIO_RT + priority; } static void thread_sched_rt_set_next(struct thread_runq *runq, struct thread *thread) { thread_sched_rt_remove(runq, thread); } static void thread_sched_rt_tick(struct thread_runq *runq, struct thread *thread) { (void)runq; if (thread_real_sched_policy(thread) != THREAD_SCHED_POLICY_RR) { return; } thread->rt_data.time_slice--; if (thread->rt_data.time_slice > 0) { return; } thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE; thread_set_flag(thread, THREAD_YIELD); } static inline unsigned short thread_sched_fs_prio2weight(unsigned short priority) { return ((priority + 1) * THREAD_FS_ROUND_SLICE_BASE); } static struct thread_runq * thread_sched_fs_select_runq(struct thread *thread) { struct thread_runq *runq, *tmp; long delta; int i; cpumap_for_each(&thread_idle_runqs, i) { if (!cpumap_test(&thread->cpumap, i)) { continue; } runq = percpu_ptr(thread_runq, i); spinlock_lock(&runq->lock); /* The run queue really is idle, return it */ if (runq->current == runq->idler) { goto out; } spinlock_unlock(&runq->lock); } runq = NULL; cpumap_for_each(&thread_active_runqs, i) { if (!cpumap_test(&thread->cpumap, i)) { continue; } tmp = percpu_ptr(thread_runq, i); spinlock_lock(&tmp->lock); if (runq == NULL) { runq = tmp; continue; } /* A run queue may have become idle */ if (tmp->current == tmp->idler) { spinlock_unlock(&runq->lock); runq = tmp; goto out; } /* * The run queue isn't idle, but there are no fair-scheduling thread, * which means there are real-time threads. */ if (tmp->fs_weight == 0) { spinlock_unlock(&tmp->lock); continue; } delta = (long)(tmp->fs_round - runq->fs_round); /* Look for the least loaded of the run queues in the highest round */ if ((delta > 0) || ((delta == 0) && (tmp->fs_weight < runq->fs_weight))) { spinlock_unlock(&runq->lock); runq = tmp; continue; } spinlock_unlock(&tmp->lock); } assert(runq != NULL); out: return runq; } static unsigned int thread_sched_fs_enqueue_scale(unsigned int work, unsigned int old_weight, unsigned int new_weight) { assert(old_weight != 0); #ifndef __LP64__ if (likely((work < 0x10000) && (new_weight < 0x10000))) { return (work * new_weight) / old_weight; } #endif /* __LP64__ */ return (unsigned int)(((unsigned long long)work * new_weight) / old_weight); } static void thread_sched_fs_enqueue(struct thread_fs_runq *fs_runq, unsigned long round, struct thread *thread) { struct thread_fs_group *group, *tmp; struct list *node, *init_node; unsigned int group_weight, total_weight; assert(thread->fs_data.fs_runq == NULL); assert(thread->fs_data.work <= thread->fs_data.weight); group = &fs_runq->group_array[thread_real_priority(thread)]; group_weight = group->weight + thread->fs_data.weight; total_weight = fs_runq->weight + thread->fs_data.weight; node = (group->weight == 0) ? list_last(&fs_runq->groups) : list_prev(&group->node); init_node = node; while (!list_end(&fs_runq->groups, node)) { tmp = list_entry(node, struct thread_fs_group, node); if (tmp->weight >= group_weight) { break; } node = list_prev(node); } if (group->weight == 0) { list_insert_after(&group->node, node); } else if (node != init_node) { list_remove(&group->node); list_insert_after(&group->node, node); } /* * XXX Unfairness can occur if the run queue round wraps around and the * thread is "lucky" enough to have the same round value. This should be * rare and harmless otherwise. 
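 *
 * When the rounds don't match, the work is rescaled below so that the
 * thread resumes with the same relative progress as the run queue. As a
 * rough example with made-up numbers : enqueuing a thread of weight 20
 * into an empty group of a run queue with work 30 and weight 60 yields
 * group_work = 30 * 20 / 60 = 10, i.e. the thread is considered to have
 * already consumed half of its weight, matching the run queue being half
 * way through its round.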
*/ if (thread->fs_data.round == round) { fs_runq->work += thread->fs_data.work; group->work += thread->fs_data.work; } else { unsigned int group_work, thread_work; if (fs_runq->weight == 0) { thread_work = 0; } else { group_work = (group->weight == 0) ? thread_sched_fs_enqueue_scale(fs_runq->work, fs_runq->weight, thread->fs_data.weight) : thread_sched_fs_enqueue_scale(group->work, group->weight, group_weight); thread_work = group_work - group->work; fs_runq->work += thread_work; group->work = group_work; } thread->fs_data.round = round; thread->fs_data.work = thread_work; } fs_runq->nr_threads++; fs_runq->weight = total_weight; group->weight = group_weight; /* Insert at the front of the group to improve interactivity */ list_insert_head(&group->threads, &thread->fs_data.group_node); list_insert_tail(&fs_runq->threads, &thread->fs_data.runq_node); thread->fs_data.fs_runq = fs_runq; } static void thread_sched_fs_restart(struct thread_runq *runq) { struct thread_fs_runq *fs_runq; struct list *node; fs_runq = runq->fs_runq_active; node = list_first(&fs_runq->groups); assert(node != NULL); fs_runq->current = list_entry(node, struct thread_fs_group, node); if (thread_real_sched_class(runq->current) == THREAD_SCHED_CLASS_FS) { thread_set_flag(runq->current, THREAD_YIELD); } } static void thread_sched_fs_add(struct thread_runq *runq, struct thread *thread) { unsigned int total_weight; if (runq->fs_weight == 0) { runq->fs_round = thread_fs_highest_round; } total_weight = runq->fs_weight + thread->fs_data.weight; /* TODO Limit the maximum number of threads to prevent this situation */ if (total_weight < runq->fs_weight) { panic("thread: weight overflow"); } runq->fs_weight = total_weight; thread_sched_fs_enqueue(runq->fs_runq_active, runq->fs_round, thread); thread_sched_fs_restart(runq); } static void thread_sched_fs_dequeue(struct thread *thread) { struct thread_fs_runq *fs_runq; struct thread_fs_group *group, *tmp; struct list *node, *init_node; assert(thread->fs_data.fs_runq != NULL); fs_runq = thread->fs_data.fs_runq; group = &fs_runq->group_array[thread_real_priority(thread)]; thread->fs_data.fs_runq = NULL; list_remove(&thread->fs_data.runq_node); list_remove(&thread->fs_data.group_node); fs_runq->work -= thread->fs_data.work; group->work -= thread->fs_data.work; fs_runq->weight -= thread->fs_data.weight; group->weight -= thread->fs_data.weight; fs_runq->nr_threads--; if (group->weight == 0) { list_remove(&group->node); } else { node = list_next(&group->node); init_node = node; while (!list_end(&fs_runq->groups, node)) { tmp = list_entry(node, struct thread_fs_group, node); if (tmp->weight <= group->weight) { break; } node = list_next(node); } if (node != init_node) { list_remove(&group->node); list_insert_before(&group->node, node); } } } static void thread_sched_fs_remove(struct thread_runq *runq, struct thread *thread) { struct thread_fs_runq *fs_runq; runq->fs_weight -= thread->fs_data.weight; fs_runq = thread->fs_data.fs_runq; thread_sched_fs_dequeue(thread); if (fs_runq == runq->fs_runq_active) { if (fs_runq->nr_threads == 0) { thread_runq_wakeup_balancer(runq); } else { thread_sched_fs_restart(runq); } } } static void thread_sched_fs_deactivate(struct thread_runq *runq, struct thread *thread) { assert(thread->fs_data.fs_runq == runq->fs_runq_active); assert(thread->fs_data.round == runq->fs_round); thread_sched_fs_dequeue(thread); thread->fs_data.round++; thread->fs_data.work -= thread->fs_data.weight; thread_sched_fs_enqueue(runq->fs_runq_expired, runq->fs_round + 1, thread); if 
(runq->fs_runq_active->nr_threads == 0) { thread_runq_wakeup_balancer(runq); } } static void thread_sched_fs_put_prev(struct thread_runq *runq, struct thread *thread) { struct thread_fs_runq *fs_runq; struct thread_fs_group *group; fs_runq = runq->fs_runq_active; group = &fs_runq->group_array[thread_real_priority(thread)]; list_insert_tail(&group->threads, &thread->fs_data.group_node); if (thread->fs_data.work >= thread->fs_data.weight) { thread_sched_fs_deactivate(runq, thread); } } static int thread_sched_fs_ratio_exceeded(struct thread_fs_group *current, struct thread_fs_group *next) { unsigned long long a, b; #ifndef __LP64__ unsigned int ia, ib; if (likely((current->weight < 0x10000) && (next->weight < 0x10000))) { ia = (current->work + 1) * next->weight; ib = (next->work + 1) * current->weight; return ia > ib; } #endif /* __LP64__ */ a = ((unsigned long long)current->work + 1) * next->weight; b = ((unsigned long long)next->work + 1) * current->weight; return a > b; } static struct thread * thread_sched_fs_get_next(struct thread_runq *runq) { struct thread_fs_runq *fs_runq; struct thread_fs_group *group, *next; struct thread *thread; struct list *node; fs_runq = runq->fs_runq_active; if (fs_runq->nr_threads == 0) { return NULL; } group = fs_runq->current; node = list_next(&group->node); if (list_end(&fs_runq->groups, node)) { node = list_first(&fs_runq->groups); group = list_entry(node, struct thread_fs_group, node); } else { next = list_entry(node, struct thread_fs_group, node); if (thread_sched_fs_ratio_exceeded(group, next)) { group = next; } else { node = list_first(&fs_runq->groups); group = list_entry(node, struct thread_fs_group, node); } } fs_runq->current = group; node = list_first(&group->threads); thread = list_entry(node, struct thread, fs_data.group_node); list_remove(node); return thread; } static void thread_sched_fs_reset_priority(struct thread *thread, unsigned short priority) { assert(priority <= THREAD_SCHED_FS_PRIO_MAX); thread->fs_data.fs_runq = NULL; thread->fs_data.round = 0; thread->fs_data.weight = thread_sched_fs_prio2weight(priority); thread->fs_data.work = 0; } static void thread_sched_fs_update_priority(struct thread *thread, unsigned short priority) { assert(priority <= THREAD_SCHED_FS_PRIO_MAX); thread->fs_data.weight = thread_sched_fs_prio2weight(priority); if (thread->fs_data.work >= thread->fs_data.weight) { thread->fs_data.work = thread->fs_data.weight; } } static unsigned int thread_sched_fs_get_global_priority(unsigned short priority) { (void)priority; return THREAD_SCHED_GLOBAL_PRIO_FS; } static void thread_sched_fs_set_next(struct thread_runq *runq, struct thread *thread) { (void)runq; list_remove(&thread->fs_data.group_node); } static void thread_sched_fs_tick(struct thread_runq *runq, struct thread *thread) { struct thread_fs_runq *fs_runq; struct thread_fs_group *group; fs_runq = runq->fs_runq_active; fs_runq->work++; group = &fs_runq->group_array[thread_real_priority(thread)]; group->work++; thread_set_flag(thread, THREAD_YIELD); thread->fs_data.work++; } static void thread_sched_fs_start_next_round(struct thread_runq *runq) { struct thread_fs_runq *tmp; long delta; tmp = runq->fs_runq_expired; runq->fs_runq_expired = runq->fs_runq_active; runq->fs_runq_active = tmp; if (runq->fs_runq_active->nr_threads != 0) { runq->fs_round++; delta = (long)(runq->fs_round - thread_fs_highest_round); if (delta > 0) { thread_fs_highest_round = runq->fs_round; } thread_sched_fs_restart(runq); } } /* * Check that a remote run queue satisfies the minimum 
migration requirements. */ static int thread_sched_fs_balance_eligible(struct thread_runq *runq, unsigned long highest_round) { unsigned int nr_threads; if (runq->fs_weight == 0) { return 0; } if ((runq->fs_round != highest_round) && (runq->fs_round != (highest_round - 1))) { return 0; } nr_threads = runq->fs_runq_active->nr_threads + runq->fs_runq_expired->nr_threads; if ((nr_threads == 0) || ((nr_threads == 1) && (thread_real_sched_class(runq->current) == THREAD_SCHED_CLASS_FS))) { return 0; } return 1; } /* * Try to find the most suitable run queue from which to pull threads. */ static struct thread_runq * thread_sched_fs_balance_scan(struct thread_runq *runq, unsigned long highest_round) { struct thread_runq *remote_runq, *tmp; unsigned long flags; int i; remote_runq = NULL; thread_preempt_disable_intr_save(&flags); cpumap_for_each(&thread_active_runqs, i) { tmp = percpu_ptr(thread_runq, i); if (tmp == runq) { continue; } spinlock_lock(&tmp->lock); if (!thread_sched_fs_balance_eligible(tmp, highest_round)) { spinlock_unlock(&tmp->lock); continue; } if (remote_runq == NULL) { remote_runq = tmp; continue; } if (tmp->fs_weight > remote_runq->fs_weight) { spinlock_unlock(&remote_runq->lock); remote_runq = tmp; continue; } spinlock_unlock(&tmp->lock); } if (remote_runq != NULL) { spinlock_unlock(&remote_runq->lock); } thread_preempt_enable_intr_restore(flags); return remote_runq; } static unsigned int thread_sched_fs_balance_pull(struct thread_runq *runq, struct thread_runq *remote_runq, struct thread_fs_runq *fs_runq, unsigned int nr_pulls) { struct thread *thread, *tmp; int cpu; cpu = thread_runq_cpu(runq); list_for_each_entry_safe(&fs_runq->threads, thread, tmp, fs_data.runq_node) { if (thread == remote_runq->current) { continue; } /* * The pin level is changed without explicit synchronization. * However, it can only be changed by its owning thread. As threads * currently running aren't considered for migration, the thread had * to be preempted and invoke the scheduler. Since balancer threads * acquire the run queue lock, there is strong ordering between * changing the pin level and setting the current thread of a * run queue. * * TODO Review comment. */ if (thread->pin_level != 0) { continue; } if (!cpumap_test(&thread->cpumap, cpu)) { continue; } /* * Make sure at least one thread is pulled if possible. If one or more * thread has already been pulled, take weights into account. */ if ((nr_pulls != 0) && ((runq->fs_weight + thread->fs_data.weight) > (remote_runq->fs_weight - thread->fs_data.weight))) { break; } thread_runq_remove(remote_runq, thread); /* Don't discard the work already accounted for */ thread->fs_data.round = runq->fs_round; thread_runq_add(runq, thread); nr_pulls++; if (nr_pulls == THREAD_MAX_MIGRATIONS) { break; } } return nr_pulls; } static unsigned int thread_sched_fs_balance_migrate(struct thread_runq *runq, struct thread_runq *remote_runq, unsigned long highest_round) { unsigned int nr_pulls; nr_pulls = 0; if (!thread_sched_fs_balance_eligible(remote_runq, highest_round)) { goto out; } nr_pulls = thread_sched_fs_balance_pull(runq, remote_runq, remote_runq->fs_runq_active, 0); if (nr_pulls == THREAD_MAX_MIGRATIONS) { goto out; } /* * Threads in the expired queue of a processor in round highest are * actually in round highest + 1. */ if (remote_runq->fs_round != highest_round) { nr_pulls = thread_sched_fs_balance_pull(runq, remote_runq, remote_runq->fs_runq_expired, nr_pulls); } out: return nr_pulls; } /* * Inter-processor load balancing for fair-scheduling threads. 
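 *
 * The strategy, inherited from DWRR, is to only pull threads from run
 * queues that are in the highest round, or one round behind it (see
 * thread_sched_fs_balance_eligible), to prefer the eligible run queue with
 * the largest fair-scheduling weight (see thread_sched_fs_balance_scan),
 * and to pull at most THREAD_MAX_MIGRATIONS threads while attempting to
 * even out the weights of the two run queues involved.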
* * Preemption must be disabled, and the local run queue must be locked when * calling this function. If balancing actually occurs, the lock will be * released and preemption enabled when needed. */ static void thread_sched_fs_balance(struct thread_runq *runq, unsigned long *flags) { struct thread_runq *remote_runq; unsigned long highest_round; unsigned int nr_migrations; int i; /* * Grab the highest round now and only use the copy so the value is stable * during the balancing operation. */ highest_round = thread_fs_highest_round; if ((runq->fs_round != highest_round) && (runq->fs_runq_expired->nr_threads != 0)) { goto no_migration; } spinlock_unlock_intr_restore(&runq->lock, *flags); thread_preempt_enable(); remote_runq = thread_sched_fs_balance_scan(runq, highest_round); if (remote_runq != NULL) { thread_preempt_disable_intr_save(flags); thread_runq_double_lock(runq, remote_runq); nr_migrations = thread_sched_fs_balance_migrate(runq, remote_runq, highest_round); spinlock_unlock(&remote_runq->lock); if (nr_migrations != 0) { return; } spinlock_unlock_intr_restore(&runq->lock, *flags); thread_preempt_enable(); } /* * The scan or the migration failed. As a fallback, make another, simpler * pass on every run queue, and stop as soon as at least one thread could * be successfully pulled. */ cpumap_for_each(&thread_active_runqs, i) { remote_runq = percpu_ptr(thread_runq, i); if (remote_runq == runq) { continue; } thread_preempt_disable_intr_save(flags); thread_runq_double_lock(runq, remote_runq); nr_migrations = thread_sched_fs_balance_migrate(runq, remote_runq, highest_round); spinlock_unlock(&remote_runq->lock); if (nr_migrations != 0) { return; } spinlock_unlock_intr_restore(&runq->lock, *flags); thread_preempt_enable(); } thread_preempt_disable(); spinlock_lock_intr_save(&runq->lock, flags); no_migration: /* * No thread could be migrated. Check the active run queue, as another * processor might have added threads while the balancer was running. * If the run queue is still empty, switch to the next round. The run * queue lock must remain held until the next scheduling decision to * prevent a remote balancer thread from stealing active threads. 
*/ if (runq->fs_runq_active->nr_threads == 0) { thread_sched_fs_start_next_round(runq); } } static struct thread_runq * thread_sched_idle_select_runq(struct thread *thread) { (void)thread; panic("thread: idler threads cannot be awaken"); } static noreturn void thread_sched_idle_panic(void) { panic("thread: only idle threads are allowed in the idle class"); } static void thread_sched_idle_add(struct thread_runq *runq, struct thread *thread) { (void)runq; (void)thread; thread_sched_idle_panic(); } static void thread_sched_idle_remove(struct thread_runq *runq, struct thread *thread) { (void)runq; (void)thread; thread_sched_idle_panic(); } static struct thread * thread_sched_idle_get_next(struct thread_runq *runq) { return runq->idler; } static unsigned int thread_sched_idle_get_global_priority(unsigned short priority) { (void)priority; return THREAD_SCHED_GLOBAL_PRIO_IDLE; } static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES] = { [THREAD_SCHED_CLASS_RT] = { .select_runq = thread_sched_rt_select_runq, .add = thread_sched_rt_add, .remove = thread_sched_rt_remove, .put_prev = thread_sched_rt_put_prev, .get_next = thread_sched_rt_get_next, .reset_priority = thread_sched_rt_reset_priority, .update_priority = NULL, .get_global_priority = thread_sched_rt_get_global_priority, .set_next = thread_sched_rt_set_next, .tick = thread_sched_rt_tick, }, [THREAD_SCHED_CLASS_FS] = { .select_runq = thread_sched_fs_select_runq, .add = thread_sched_fs_add, .remove = thread_sched_fs_remove, .put_prev = thread_sched_fs_put_prev, .get_next = thread_sched_fs_get_next, .reset_priority = thread_sched_fs_reset_priority, .update_priority = thread_sched_fs_update_priority, .get_global_priority = thread_sched_fs_get_global_priority, .set_next = thread_sched_fs_set_next, .tick = thread_sched_fs_tick, }, [THREAD_SCHED_CLASS_IDLE] = { .select_runq = thread_sched_idle_select_runq, .add = thread_sched_idle_add, .remove = thread_sched_idle_remove, .put_prev = NULL, .get_next = thread_sched_idle_get_next, .reset_priority = NULL, .update_priority = NULL, .get_global_priority = thread_sched_idle_get_global_priority, .set_next = NULL, .tick = NULL, }, }; static void thread_set_user_sched_policy(struct thread *thread, unsigned char sched_policy) { thread->user_sched_data.sched_policy = sched_policy; } static void thread_set_user_sched_class(struct thread *thread, unsigned char sched_class) { thread->user_sched_data.sched_class = sched_class; } static void thread_set_user_priority(struct thread *thread, unsigned short priority) { const struct thread_sched_ops *ops; ops = thread_get_user_sched_ops(thread); thread->user_sched_data.priority = priority; thread->user_sched_data.global_priority = ops->get_global_priority(priority); } static void thread_update_user_priority(struct thread *thread, unsigned short priority) { thread_set_user_priority(thread, priority); } static void thread_set_real_sched_policy(struct thread *thread, unsigned char sched_policy) { thread->real_sched_data.sched_policy = sched_policy; } static void thread_set_real_sched_class(struct thread *thread, unsigned char sched_class) { thread->real_sched_data.sched_class = sched_class; } static void thread_set_real_priority(struct thread *thread, unsigned short priority) { const struct thread_sched_ops *ops; ops = thread_get_real_sched_ops(thread); thread->real_sched_data.priority = priority; thread->real_sched_data.global_priority = ops->get_global_priority(priority); if (ops->reset_priority != NULL) { ops->reset_priority(thread, priority); } } 
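/*
 * Threads have two sets of scheduling parameters : the user parameters,
 * which are those requested through the regular interface (thread
 * attributes, thread_setscheduler()), and the real parameters, which are
 * the ones the scheduler actually uses. They only differ while a thread
 * is boosted through priority inheritance, and thread_reset_real_priority()
 * below copies the user parameters back into the real ones.
 */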
static void thread_update_real_priority(struct thread *thread, unsigned short priority) { const struct thread_sched_ops *ops; ops = thread_get_real_sched_ops(thread); thread->real_sched_data.priority = priority; thread->real_sched_data.global_priority = ops->get_global_priority(priority); if (ops->update_priority != NULL) { ops->update_priority(thread, priority); } } static void thread_reset_real_priority(struct thread *thread) { const struct thread_sched_ops *ops; struct thread_sched_data *user, *real; user = &thread->user_sched_data; real = &thread->real_sched_data; *real = *user; thread->boosted = false; ops = thread_get_user_sched_ops(thread); if (ops->reset_priority != NULL) { ops->reset_priority(thread, real->priority); } } static void __init thread_init_booter(unsigned int cpu) { struct thread *booter; /* Initialize only what's needed during bootstrap */ booter = &thread_booters[cpu]; booter->nr_refs = 0; /* Make sure booters aren't destroyed */ booter->flags = 0; booter->intr_level = 0; booter->preempt_level = 1; rcu_reader_init(&booter->rcu_reader); cpumap_fill(&booter->cpumap); thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE); thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE); thread_set_user_priority(booter, 0); thread_reset_real_priority(booter); memset(booter->tsd, 0, sizeof(booter->tsd)); booter->task = task_get_kernel_task(); snprintf(booter->name, sizeof(booter->name), THREAD_KERNEL_PREFIX "thread_boot/%u", cpu); } static int __init thread_setup_booter(void) { tcb_set_current(&thread_booters[0].tcb); thread_init_booter(0); return 0; } INIT_OP_DEFINE(thread_setup_booter, INIT_OP_DEP(tcb_setup, true)); static int __init thread_bootstrap(void) { cpumap_zero(&thread_active_runqs); cpumap_zero(&thread_idle_runqs); thread_fs_highest_round = THREAD_FS_INITIAL_ROUND; cpumap_set(&thread_active_runqs, 0); thread_runq_init(cpu_local_ptr(thread_runq), 0, &thread_booters[0]); return 0; } INIT_OP_DEFINE(thread_bootstrap, INIT_OP_DEP(syscnt_setup, true), INIT_OP_DEP(thread_setup_booter, true)); void thread_main(void (*fn)(void *), void *arg) { struct thread *thread; assert(!cpu_intr_enabled()); assert(!thread_preempt_enabled()); thread = thread_self(); thread_runq_schedule_load(thread); spinlock_unlock(&thread_runq_local()->lock); cpu_intr_enable(); thread_preempt_enable(); fn(arg); thread_exit(); } static void thread_destroy_tsd(struct thread *thread) { void *ptr; unsigned int i; i = 0; while (i < thread_nr_keys) { if ((thread->tsd[i] == NULL) || (thread_dtors[i] == NULL)) { i++; continue; } /* * Follow the POSIX description of TSD: set the key to NULL before * calling the destructor and repeat as long as it's not NULL. */ ptr = thread->tsd[i]; thread->tsd[i] = NULL; thread_dtors[i](ptr); if (thread->tsd[i] == NULL) { i++; } } } static int thread_init(struct thread *thread, void *stack, const struct thread_attr *attr, void (*fn)(void *), void *arg) { struct thread *caller; struct task *task; struct cpumap *cpumap; int error; caller = thread_self(); task = (attr->task == NULL) ? caller->task : attr->task; cpumap = (attr->cpumap == NULL) ? 
&caller->cpumap : attr->cpumap; assert(attr->policy < ARRAY_SIZE(thread_policy_table)); thread->nr_refs = 1; thread->flags = 0; thread->runq = NULL; thread->in_runq = false; thread_set_wchan(thread, thread, "init"); thread->state = THREAD_SLEEPING; thread->priv_sleepq = sleepq_create(); if (thread->priv_sleepq == NULL) { error = ENOMEM; goto error_sleepq; } thread->priv_turnstile = turnstile_create(); if (thread->priv_turnstile == NULL) { error = ENOMEM; goto error_turnstile; } turnstile_td_init(&thread->turnstile_td); thread->propagate_priority = false; thread->preempt_level = THREAD_SUSPEND_PREEMPT_LEVEL; thread->pin_level = 0; thread->intr_level = 0; rcu_reader_init(&thread->rcu_reader); cpumap_copy(&thread->cpumap, cpumap); thread_set_user_sched_policy(thread, attr->policy); thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy)); thread_set_user_priority(thread, attr->priority); thread_reset_real_priority(thread); memset(thread->tsd, 0, sizeof(thread->tsd)); thread->join_waiter = NULL; spinlock_init(&thread->join_lock); thread->terminating = false; thread->task = task; thread->stack = stack; strlcpy(thread->name, attr->name, sizeof(thread->name)); if (attr->flags & THREAD_ATTR_DETACHED) { thread->flags |= THREAD_DETACHED; } #ifdef CONFIG_PERFMON error = perfmon_thread_init(thread); if (error) { goto error_perfmon; } #endif /* CONFIG_PERFMON */ error = tcb_build(&thread->tcb, stack, fn, arg); if (error) { goto error_tcb; } task_add_thread(task, thread); return 0; error_tcb: #ifdef CONFIG_PERFMON perfmon_thread_destroy(thread); error_perfmon: #endif /* CONFIG_PERFMON */ thread_destroy_tsd(thread); turnstile_destroy(thread->priv_turnstile); error_turnstile: sleepq_destroy(thread->priv_sleepq); error_sleepq: return error; } static struct thread_runq * thread_lock_runq(struct thread *thread, unsigned long *flags) { struct thread_runq *runq; for (;;) { runq = atomic_load(&thread->runq, ATOMIC_RELAXED); spinlock_lock_intr_save(&runq->lock, flags); if (runq == atomic_load(&thread->runq, ATOMIC_RELAXED)) { return runq; } spinlock_unlock_intr_restore(&runq->lock, *flags); } } static void thread_unlock_runq(struct thread_runq *runq, unsigned long flags) { spinlock_unlock_intr_restore(&runq->lock, flags); } #ifdef CONFIG_THREAD_STACK_GUARD #include #include #include static void * thread_alloc_stack(void) { struct vm_page *first_page, *last_page; phys_addr_t first_pa, last_pa; struct pmap *kernel_pmap; size_t stack_size; uintptr_t va; void *mem; int error; kernel_pmap = pmap_get_kernel_pmap(); stack_size = vm_page_round(TCB_STACK_SIZE); mem = vm_kmem_alloc((PAGE_SIZE * 2) + stack_size); if (mem == NULL) { return NULL; } va = (uintptr_t)mem; /* * TODO Until memory protection is implemented, use the pmap system * to remove mappings. 
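 *
 * The allocation is laid out as one unmapped guard page, followed by the
 * stack itself, followed by another unmapped guard page, so that both
 * overflows and underflows of the stack fault instead of silently
 * corrupting adjacent memory. The address returned to the caller is the
 * start of the stack, one page past the base of the allocation.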
*/ error = pmap_kextract(va, &first_pa); assert(!error); error = pmap_kextract(va + PAGE_SIZE + stack_size, &last_pa); assert(!error); first_page = vm_page_lookup(first_pa); assert(first_page != NULL); last_page = vm_page_lookup(last_pa); assert(last_page != NULL); pmap_remove(kernel_pmap, va, cpumap_all()); pmap_remove(kernel_pmap, va + PAGE_SIZE + stack_size, cpumap_all()); pmap_update(kernel_pmap); return (void *)va + PAGE_SIZE; } static void thread_free_stack(void *stack) { size_t stack_size; void *va; stack_size = vm_page_round(TCB_STACK_SIZE); va = (void *)stack - PAGE_SIZE; vm_kmem_free(va, (PAGE_SIZE * 2) + stack_size); } #else /* CONFIG_THREAD_STACK_GUARD */ static void * thread_alloc_stack(void) { return kmem_cache_alloc(&thread_stack_cache); } static void thread_free_stack(void *stack) { kmem_cache_free(&thread_stack_cache, stack); } #endif /* CONFIG_THREAD_STACK_GUARD */ static void thread_destroy(struct thread *thread) { assert(thread != thread_self()); assert(thread->state == THREAD_DEAD); /* See task_info() */ task_remove_thread(thread->task, thread); #ifdef CONFIG_PERFMON perfmon_thread_destroy(thread); #endif thread_destroy_tsd(thread); turnstile_destroy(thread->priv_turnstile); sleepq_destroy(thread->priv_sleepq); thread_free_stack(thread->stack); tcb_cleanup(&thread->tcb); kmem_cache_free(&thread_cache, thread); } static void thread_join_common(struct thread *thread) { struct thread_runq *runq; unsigned long flags, state; struct thread *self; self = thread_self(); assert(thread != self); spinlock_lock(&thread->join_lock); assert(!thread->join_waiter); thread->join_waiter = self; while (!thread->terminating) { thread_sleep(&thread->join_lock, thread, "exit"); } spinlock_unlock(&thread->join_lock); do { runq = thread_lock_runq(thread, &flags); state = thread->state; thread_unlock_runq(runq, flags); } while (state != THREAD_DEAD); thread_destroy(thread); } void thread_terminate(struct thread *thread) { spinlock_lock(&thread->join_lock); thread->terminating = true; thread_wakeup(thread->join_waiter); spinlock_unlock(&thread->join_lock); } static void thread_balance_idle_tick(struct thread_runq *runq) { assert(runq->idle_balance_ticks != 0); /* * Interrupts can occur early, at a time the balancer thread hasn't been * created yet. */ if (runq->balancer == NULL) { return; } runq->idle_balance_ticks--; if (runq->idle_balance_ticks == 0) { thread_runq_wakeup_balancer(runq); } } static void thread_balance(void *arg) { struct thread_runq *runq; struct thread *self; unsigned long flags; runq = arg; self = runq->balancer; assert(self == runq->balancer); thread_preempt_disable(); spinlock_lock_intr_save(&runq->lock, &flags); for (;;) { runq->idle_balance_ticks = THREAD_IDLE_BALANCE_TICKS; thread_set_wchan(self, runq, "runq"); self->state = THREAD_SLEEPING; runq = thread_runq_schedule(runq); assert(runq == arg); /* * This function may temporarily enable preemption and release the * run queue lock, but on return, the lock must remain held until this * balancer thread sleeps. 
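 *
 * This holds whether the call below pulled threads, found nothing to pull,
 * or switched the local run queue to the next round : all return paths of
 * thread_sched_fs_balance leave the local run queue locked with preemption
 * disabled.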
*/ thread_sched_fs_balance(runq, &flags); } } static void __init thread_setup_balancer(struct thread_runq *runq) { char name[THREAD_NAME_SIZE]; struct thread_attr attr; struct thread *balancer; struct cpumap *cpumap; int error; error = cpumap_create(&cpumap); if (error) { panic("thread: unable to create balancer thread CPU map"); } cpumap_zero(cpumap); cpumap_set(cpumap, thread_runq_cpu(runq)); snprintf(name, sizeof(name), THREAD_KERNEL_PREFIX "thread_balance/%u", thread_runq_cpu(runq)); thread_attr_init(&attr, name); thread_attr_set_cpumap(&attr, cpumap); thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO); thread_attr_set_priority(&attr, THREAD_SCHED_RT_PRIO_MIN); error = thread_create(&balancer, &attr, thread_balance, runq); cpumap_destroy(cpumap); if (error) { panic("thread: unable to create balancer thread"); } runq->balancer = balancer; } static void thread_idle(void *arg) { struct thread *self; (void)arg; self = thread_self(); for (;;) { thread_preempt_disable(); for (;;) { cpu_intr_disable(); if (thread_test_flag(self, THREAD_YIELD)) { cpu_intr_enable(); break; } cpu_idle(); } thread_preempt_enable(); } } static void __init thread_setup_idler(struct thread_runq *runq) { char name[THREAD_NAME_SIZE]; struct thread_attr attr; struct thread *idler; struct cpumap *cpumap; void *stack; int error; error = cpumap_create(&cpumap); if (error) { panic("thread: unable to allocate idler thread CPU map"); } cpumap_zero(cpumap); cpumap_set(cpumap, thread_runq_cpu(runq)); idler = kmem_cache_alloc(&thread_cache); if (idler == NULL) { panic("thread: unable to allocate idler thread"); } stack = thread_alloc_stack(); if (stack == NULL) { panic("thread: unable to allocate idler thread stack"); } snprintf(name, sizeof(name), THREAD_KERNEL_PREFIX "thread_idle/%u", thread_runq_cpu(runq)); thread_attr_init(&attr, name); thread_attr_set_cpumap(&attr, cpumap); thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_IDLE); error = thread_init(idler, stack, &attr, thread_idle, NULL); if (error) { panic("thread: unable to initialize idler thread"); } cpumap_destroy(cpumap); /* An idler thread needs special tuning */ thread_clear_wchan(idler); idler->state = THREAD_RUNNING; idler->runq = runq; runq->idler = idler; } static void __init thread_setup_runq(struct thread_runq *runq) { thread_setup_balancer(runq); thread_setup_idler(runq); } #ifdef CONFIG_SHELL /* * This function is meant for debugging only. As a result, it uses a weak * locking policy which allows tracing threads which state may mutate during * tracing. 
*/ static void thread_shell_trace(int argc, char *argv[]) { const char *task_name, *thread_name; struct thread_runq *runq; struct thread *thread; unsigned long flags; struct task *task; int error; if (argc != 3) { error = EINVAL; goto error; } task_name = argv[1]; thread_name = argv[2]; task = task_lookup(task_name); if (task == NULL) { error = ESRCH; goto error; } thread = task_lookup_thread(task, thread_name); task_unref(task); if (thread == NULL) { error = ESRCH; goto error; } runq = thread_lock_runq(thread, &flags); if (thread == runq->current) { printf("thread: trace: thread is running\n"); } else { tcb_trace(&thread->tcb); } thread_unlock_runq(runq, flags); thread_unref(thread); return; error: printf("thread: trace: %s\n", strerror(error)); } static struct shell_cmd thread_shell_cmds[] = { SHELL_CMD_INITIALIZER("thread_trace", thread_shell_trace, "thread_trace ", "display the stack trace of a given thread"), }; static int __init thread_setup_shell(void) { SHELL_REGISTER_CMDS(thread_shell_cmds); return 0; } INIT_OP_DEFINE(thread_setup_shell, INIT_OP_DEP(printf_setup, true), INIT_OP_DEP(shell_setup, true), INIT_OP_DEP(task_setup, true), INIT_OP_DEP(thread_setup, true)); #endif /* CONFIG_SHELL */ static void __init thread_setup_common(unsigned int cpu) { assert(cpu != 0); cpumap_set(&thread_active_runqs, cpu); thread_init_booter(cpu); thread_runq_init(percpu_ptr(thread_runq, cpu), cpu, &thread_booters[cpu]); } static int __init thread_setup(void) { int cpu; for (cpu = 1; (unsigned int)cpu < cpu_count(); cpu++) { thread_setup_common(cpu); } kmem_cache_init(&thread_cache, "thread", sizeof(struct thread), CPU_L1_SIZE, NULL, 0); #ifndef CONFIG_THREAD_STACK_GUARD kmem_cache_init(&thread_stack_cache, "thread_stack", TCB_STACK_SIZE, CPU_DATA_ALIGN, NULL, 0); #endif /* CONFIG_THREAD_STACK_GUARD */ cpumap_for_each(&thread_active_runqs, cpu) { thread_setup_runq(percpu_ptr(thread_runq, cpu)); } return 0; } #ifdef CONFIG_THREAD_STACK_GUARD #define THREAD_STACK_GUARD_INIT_OP_DEPS \ INIT_OP_DEP(vm_kmem_setup, true), \ INIT_OP_DEP(vm_map_setup, true), \ INIT_OP_DEP(vm_page_setup, true), #else /* CONFIG_THREAD_STACK_GUARD */ #define THREAD_STACK_GUARD_INIT_OP_DEPS #endif /* CONFIG_THREAD_STACK_GUARD */ #ifdef CONFIG_PERFMON #define THREAD_PERFMON_INIT_OP_DEPS \ INIT_OP_DEP(perfmon_bootstrap, true), #else /* CONFIG_PERFMON */ #define THREAD_PERFMON_INIT_OP_DEPS #endif /* CONFIG_PERFMON */ INIT_OP_DEFINE(thread_setup, INIT_OP_DEP(cpumap_setup, true), INIT_OP_DEP(kmem_setup, true), INIT_OP_DEP(pmap_setup, true), INIT_OP_DEP(sleepq_setup, true), INIT_OP_DEP(task_setup, true), INIT_OP_DEP(thread_bootstrap, true), INIT_OP_DEP(turnstile_setup, true), THREAD_PERFMON_INIT_OP_DEPS THREAD_STACK_GUARD_INIT_OP_DEPS ); void __init thread_ap_setup(void) { tcb_set_current(&thread_booters[cpu_id()].tcb); } int thread_create(struct thread **threadp, const struct thread_attr *attr, void (*fn)(void *), void *arg) { struct thread *thread; void *stack; int error; if (attr->cpumap != NULL) { error = cpumap_check(attr->cpumap); if (error) { return error; } } thread = kmem_cache_alloc(&thread_cache); if (thread == NULL) { error = ENOMEM; goto error_thread; } stack = thread_alloc_stack(); if (stack == NULL) { error = ENOMEM; goto error_stack; } error = thread_init(thread, stack, attr, fn, arg); if (error) { goto error_init; } /* * The new thread address must be written before the thread is started * in case it's passed to it. 
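 *
 * The thread was left in the THREAD_SLEEPING state by thread_init, so the
 * wakeup below is what actually dispatches it for the first time.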
*/ *threadp = thread; thread_wakeup(thread); return 0; error_init: thread_free_stack(stack); error_stack: kmem_cache_free(&thread_cache, thread); error_thread: return error; } static void thread_reap(struct work *work) { struct thread_zombie *zombie; zombie = structof(work, struct thread_zombie, work); thread_join_common(zombie->thread); } void thread_exit(void) { struct thread_zombie zombie; struct thread_runq *runq; struct thread *thread; unsigned long flags; thread = thread_self(); if (thread_test_flag(thread, THREAD_DETACHED)) { zombie.thread = thread; work_init(&zombie.work, thread_reap); work_schedule(&zombie.work, 0); } /* * Disable preemption before dropping the reference, as this may * trigger the active state poll of the join operation. Doing so * keeps the duration of that active wait minimum. */ thread_preempt_disable(); thread_unref(thread); runq = thread_runq_local(); spinlock_lock_intr_save(&runq->lock, &flags); thread->state = THREAD_DEAD; thread_runq_schedule(runq); panic("thread: dead thread walking"); } void thread_join(struct thread *thread) { assert(!thread_test_flag(thread, THREAD_DETACHED)); thread_join_common(thread); } static int thread_wakeup_common(struct thread *thread, int error) { struct thread_runq *runq; unsigned long flags; if ((thread == NULL) || (thread == thread_self())) { return EINVAL; } /* * There is at most one reference on threads that were never dispatched, * in which case there is no need to lock anything. */ if (thread->runq == NULL) { assert(thread->state != THREAD_RUNNING); thread_clear_wchan(thread); thread->state = THREAD_RUNNING; } else { runq = thread_lock_runq(thread, &flags); if (thread->state == THREAD_RUNNING) { thread_unlock_runq(runq, flags); return EINVAL; } thread_clear_wchan(thread); thread->state = THREAD_RUNNING; thread_unlock_runq(runq, flags); } thread_preempt_disable_intr_save(&flags); if (thread->pin_level == 0) { runq = thread_get_real_sched_ops(thread)->select_runq(thread); } else { /* * This access doesn't need to be atomic, as the current thread is * the only one which may update the member. 
static int
thread_wakeup_common(struct thread *thread, int error)
{
    struct thread_runq *runq;
    unsigned long flags;

    if ((thread == NULL) || (thread == thread_self())) {
        return EINVAL;
    }

    /*
     * There is at most one reference on threads that were never dispatched,
     * in which case there is no need to lock anything.
     */
    if (thread->runq == NULL) {
        assert(thread->state != THREAD_RUNNING);
        thread_clear_wchan(thread);
        thread->state = THREAD_RUNNING;
    } else {
        runq = thread_lock_runq(thread, &flags);

        if (thread->state == THREAD_RUNNING) {
            thread_unlock_runq(runq, flags);
            return EINVAL;
        }

        thread_clear_wchan(thread);
        thread->state = THREAD_RUNNING;
        thread_unlock_runq(runq, flags);
    }

    thread_preempt_disable_intr_save(&flags);

    if (thread->pin_level == 0) {
        runq = thread_get_real_sched_ops(thread)->select_runq(thread);
    } else {
        /*
         * This access doesn't need to be atomic, as the current thread is
         * the only one which may update the member.
         */
        runq = thread->runq;
        spinlock_lock(&runq->lock);
    }

    thread->wakeup_error = error;
    thread_runq_wakeup(runq, thread);
    spinlock_unlock(&runq->lock);
    thread_preempt_enable_intr_restore(flags);

    return 0;
}

int
thread_wakeup(struct thread *thread)
{
    return thread_wakeup_common(thread, 0);
}

struct thread_timeout_waiter {
    struct thread *thread;
    struct timer timer;
};

static void
thread_timeout(struct timer *timer)
{
    struct thread_timeout_waiter *waiter;

    waiter = structof(timer, struct thread_timeout_waiter, timer);
    thread_wakeup_common(waiter->thread, ETIMEDOUT);
}

static int
thread_sleep_common(struct spinlock *interlock, const void *wchan_addr,
                    const char *wchan_desc, bool timed, uint64_t ticks)
{
    struct thread_timeout_waiter waiter;
    struct thread_runq *runq;
    struct thread *thread;
    unsigned long flags;

    thread = thread_self();

    if (timed) {
        waiter.thread = thread;
        timer_init(&waiter.timer, thread_timeout, TIMER_INTR);
        timer_schedule(&waiter.timer, ticks);
    }

    runq = thread_runq_local();
    spinlock_lock_intr_save(&runq->lock, &flags);

    if (interlock != NULL) {
        thread_preempt_disable();
        spinlock_unlock(interlock);
    }

    thread_set_wchan(thread, wchan_addr, wchan_desc);
    thread->state = THREAD_SLEEPING;

    runq = thread_runq_schedule(runq);
    assert(thread->state == THREAD_RUNNING);

    spinlock_unlock_intr_restore(&runq->lock, flags);

    if (timed) {
        timer_cancel(&waiter.timer);
    }

    if (interlock != NULL) {
        spinlock_lock(interlock);
        thread_preempt_enable_no_resched();
    }

    return thread->wakeup_error;
}

void
thread_sleep(struct spinlock *interlock, const void *wchan_addr,
             const char *wchan_desc)
{
    int error;

    error = thread_sleep_common(interlock, wchan_addr, wchan_desc, false, 0);
    assert(!error);
}

int
thread_timedsleep(struct spinlock *interlock, const void *wchan_addr,
                  const char *wchan_desc, uint64_t ticks)
{
    return thread_sleep_common(interlock, wchan_addr, wchan_desc, true, ticks);
}

void
thread_delay(uint64_t ticks, bool absolute)
{
    thread_preempt_disable();

    if (!absolute) {
        /* Add a tick to avoid quantization errors */
        ticks += clock_get_time() + 1;
    }

    thread_timedsleep(NULL, thread_self(), "delay", ticks);
    thread_preempt_enable();
}
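/*
 * Illustrative sketch, not part of the original module: the interlocked
 * sleep/wakeup pattern supported by thread_sleep() above. The waiter checks
 * a predicate under a spinlock and sleeps with that lock as the interlock,
 * which is atomically released while sleeping and reacquired before
 * thread_sleep() returns; the waker updates the predicate under the same
 * lock and calls thread_wakeup(). The example_* names are hypothetical, and
 * real users would normally go through the higher-level synchronization
 * objects built on these primitives. Kept under #if 0 so it never affects
 * the build.
 */
#if 0
/* Hypothetical shared state, with example_lock initialized elsewhere */
static struct spinlock example_lock;
static bool example_ready;
static struct thread *example_waiter;

static void
example_wait_for_ready(void)
{
    spinlock_lock(&example_lock);
    example_waiter = thread_self();

    /*
     * Recheck the predicate after every wakeup, since a wakeup may be
     * issued before the predicate is actually satisfied.
     */
    while (!example_ready) {
        thread_sleep(&example_lock, &example_ready, "exready");
    }

    example_waiter = NULL;
    spinlock_unlock(&example_lock);
}

static void
example_signal_ready(void)
{
    struct thread *waiter;

    spinlock_lock(&example_lock);
    example_ready = true;
    waiter = example_waiter;
    spinlock_unlock(&example_lock);

    /*
     * Safe in this sketch only as long as the waiter can't be destroyed
     * concurrently; real code must guarantee that, e.g. with a reference.
     */
    if (waiter != NULL) {
        thread_wakeup(waiter);
    }
}
#endif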
static void __init
thread_boot_barrier(void)
{
    unsigned int nr_cpus;

    assert(!cpu_intr_enabled());

    atomic_add(&thread_nr_boot_cpus, 1, ATOMIC_RELAXED);

    for (;;) {
        nr_cpus = atomic_load(&thread_nr_boot_cpus, ATOMIC_SEQ_CST);

        if (nr_cpus == cpu_count()) {
            break;
        }

        cpu_pause();
    }
}

void __init
thread_run_scheduler(void)
{
    struct thread_runq *runq;
    struct thread *thread;

    assert(!cpu_intr_enabled());

    thread_boot_barrier();

    runq = thread_runq_local();
    thread = thread_self();
    assert(thread == runq->current);
    assert(thread->preempt_level == (THREAD_SUSPEND_PREEMPT_LEVEL - 1));

    sref_register();

    spinlock_lock(&runq->lock);
    thread = thread_runq_get_next(thread_runq_local());
    spinlock_transfer_owner(&runq->lock, thread);

    tcb_load(&thread->tcb);
}

void
thread_yield(void)
{
    struct thread_runq *runq;
    struct thread *thread;
    unsigned long flags;

    thread = thread_self();

    if (!thread_preempt_enabled()) {
        return;
    }

    do {
        thread_preempt_disable();
        runq = thread_runq_local();
        spinlock_lock_intr_save(&runq->lock, &flags);
        runq = thread_runq_schedule(runq);
        spinlock_unlock_intr_restore(&runq->lock, flags);
        thread_preempt_enable_no_resched();
    } while (thread_test_flag(thread, THREAD_YIELD));
}

void
thread_schedule(void)
{
    if (likely(!thread_test_flag(thread_self(), THREAD_YIELD))) {
        return;
    }

    thread_yield();
}

void
thread_schedule_intr(void)
{
    struct thread_runq *runq;

    assert(thread_check_intr_context());

    runq = thread_runq_local();
    syscnt_inc(&runq->sc_schedule_intrs);
}

void
thread_report_periodic_event(void)
{
    const struct thread_sched_ops *ops;
    struct thread_runq *runq;
    struct thread *thread;

    assert(thread_check_intr_context());

    runq = thread_runq_local();
    thread = thread_self();

    spinlock_lock(&runq->lock);

    if (runq->nr_threads == 0) {
        thread_balance_idle_tick(runq);
    }

    ops = thread_get_real_sched_ops(thread);

    if (ops->tick != NULL) {
        ops->tick(runq, thread);
    }

    spinlock_unlock(&runq->lock);
}

unsigned int
thread_cpu(const struct thread *thread)
{
    assert(thread->runq);
    return thread->runq->cpu;
}

char
thread_state_to_chr(const struct thread *thread)
{
    switch (thread->state) {
    case THREAD_RUNNING:
        return 'R';
    case THREAD_SLEEPING:
        return 'S';
    case THREAD_DEAD:
        return 'Z';
    default:
        panic("thread: unknown state");
    }
}

const char *
thread_sched_class_to_str(unsigned char sched_class)
{
    switch (sched_class) {
    case THREAD_SCHED_CLASS_RT:
        return "rt";
    case THREAD_SCHED_CLASS_FS:
        return "fs";
    case THREAD_SCHED_CLASS_IDLE:
        return "idle";
    default:
        panic("thread: unknown scheduling class");
    }
}

void
thread_setscheduler(struct thread *thread, unsigned char policy,
                    unsigned short priority)
{
    struct thread_runq *runq;
    struct turnstile_td *td;
    unsigned long flags;
    bool requeue, current, update;

    td = thread_turnstile_td(thread);
    turnstile_td_lock(td);
    runq = thread_lock_runq(thread, &flags);

    if ((thread_user_sched_policy(thread) == policy)
        && (thread_user_priority(thread) == priority)) {
        goto out;
    }

    requeue = thread->in_runq;

    if (!requeue) {
        current = false;
    } else {
        if (thread != runq->current) {
            current = false;
        } else {
            thread_runq_put_prev(runq, thread);
            current = true;
        }

        thread_runq_remove(runq, thread);
    }

    if (thread_user_sched_policy(thread) == policy) {
        thread_update_user_priority(thread, priority);
        update = true;
    } else {
        thread_set_user_sched_policy(thread, policy);
        thread_set_user_sched_class(thread, thread_policy_to_class(policy));
        thread_set_user_priority(thread, priority);
        update = false;
    }

    if (thread->boosted) {
        if (thread_user_global_priority(thread)
            >= thread_real_global_priority(thread)) {
            thread_reset_real_priority(thread);
        }
    } else {
        if (update) {
            thread_update_real_priority(thread, priority);
        } else {
            thread_set_real_sched_policy(thread, policy);
            thread_set_real_sched_class(thread,
                                        thread_policy_to_class(policy));
            thread_set_real_priority(thread, priority);
        }
    }

    if (requeue) {
        thread_runq_add(runq, thread);

        if (current) {
            thread_runq_set_next(runq, thread);
        }
    }

out:
    thread_unlock_runq(runq, flags);
    turnstile_td_unlock(td);
    turnstile_td_propagate_priority(td);
}

void
thread_pi_setscheduler(struct thread *thread, unsigned char policy,
                       unsigned short priority)
{
    const struct thread_sched_ops *ops;
    struct thread_runq *runq;
    struct turnstile_td *td;
    unsigned int global_priority;
    unsigned long flags;
    bool requeue, current;

    td = thread_turnstile_td(thread);
    turnstile_td_assert_lock(td);

    ops = thread_get_sched_ops(thread_policy_to_class(policy));
    global_priority = ops->get_global_priority(priority);

    runq = thread_lock_runq(thread, &flags);

    if ((thread_real_sched_policy(thread) == policy)
        && (thread_real_priority(thread) == priority)) {
        goto out;
    }

    syscnt_inc(&runq->sc_boosts);

    requeue = thread->in_runq;

    if (!requeue) {
        current = false;
    } else {
        if (thread != runq->current) {
            current = false;
        } else {
            thread_runq_put_prev(runq, thread);
            current = true;
        }

        thread_runq_remove(runq, thread);
    }

    if (global_priority <= thread_user_global_priority(thread)) {
        thread_reset_real_priority(thread);
    } else {
        if (thread_real_sched_policy(thread) == policy) {
            thread_update_real_priority(thread, priority);
        } else {
            thread_set_real_sched_policy(thread, policy);
            thread_set_real_sched_class(thread,
                                        thread_policy_to_class(policy));
            thread_set_real_priority(thread, priority);
        }

        thread->boosted = true;
    }

    if (requeue) {
        thread_runq_add(runq, thread);

        if (current) {
            thread_runq_set_next(runq, thread);
        }
    }

out:
    thread_unlock_runq(runq, flags);
}
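/*
 * Illustrative sketch, not part of the original module: selecting a
 * scheduling policy, either at creation time through the thread attributes
 * or later with thread_setscheduler() above. THREAD_SCHED_POLICY_RR, the
 * attribute helpers (thread_attr_set_policy, thread_attr_set_priority) and
 * the priority value are assumptions based on kern/thread.h; the example_*
 * names are hypothetical. Kept under #if 0 so it never affects the build.
 */
#if 0
static int
example_spawn_rt(struct thread **threadp, void (*fn)(void *), void *arg)
{
    struct thread_attr attr;

    /* Setting the policy and priority at creation time avoids a requeue */
    thread_attr_init(&attr, "x15_example_rt");
    thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_RR);
    thread_attr_set_priority(&attr, 1);

    return thread_create(threadp, &attr, fn, arg);
}

static void
example_make_rt(struct thread *thread)
{
    /*
     * thread_setscheduler() changes the user policy/priority; the real
     * (effective) ones are only updated if the thread isn't currently
     * boosted by priority inheritance.
     */
    thread_setscheduler(thread, THREAD_SCHED_POLICY_RR, 1);
}
#endif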
void
thread_propagate_priority(void)
{
    struct thread *thread;

    /*
     * Although it's possible to propagate priority with preemption
     * disabled, the operation can be too expensive to allow it.
     */
    if (!thread_preempt_enabled()) {
        thread_set_priority_propagation_needed();
        return;
    }

    thread = thread_self();

    /* Clear before propagation to avoid infinite recursion */
    thread->propagate_priority = false;

    turnstile_td_propagate_priority(thread_turnstile_td(thread));
}

void
thread_key_create(unsigned int *keyp, thread_dtor_fn_t dtor)
{
    unsigned int key;

    key = atomic_fetch_add(&thread_nr_keys, 1, ATOMIC_RELAXED);

    if (key >= THREAD_KEYS_MAX) {
        panic("thread: maximum number of keys exceeded");
    }

    thread_dtors[key] = dtor;
    *keyp = key;
}

bool
thread_is_running(const struct thread *thread)
{
    const struct thread_runq *runq;

    runq = atomic_load(&thread->runq, ATOMIC_RELAXED);

    return (runq != NULL)
           && (atomic_load(&runq->current, ATOMIC_RELAXED) == thread);
}
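/*
 * Illustrative sketch, not part of the original module: registering a
 * thread-specific data key with thread_key_create() above. The destructor
 * signature assumes thread_dtor_fn_t is void (*)(void *), the slot accessors
 * thread_tsd_set()/thread_tsd_get() are assumed from kern/thread.h, and
 * kmem_alloc()/kmem_free() are only an example payload; the example_* names
 * are hypothetical. Kept under #if 0 so it never affects the build.
 */
#if 0
struct example_state {
    unsigned long counter;
};

static unsigned int example_key;

static void
example_key_dtor(void *ptr)
{
    /* Called for a dying thread whose slot holds a non-NULL value */
    kmem_free(ptr, sizeof(struct example_state));
}

static void
example_key_setup(void)
{
    /* Keys are allocated once and never released */
    thread_key_create(&example_key, example_key_dtor);
}

static struct example_state *
example_state_get(void)
{
    struct example_state *state;

    state = thread_tsd_get(example_key);

    if (state == NULL) {
        state = kmem_alloc(sizeof(*state));
        thread_tsd_set(example_key, state);
    }

    return state;
}
#endif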