/*
 * Copyright (c) 2012, 2013 Richard Braun.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 *
 * The scheduling algorithm implemented by this module, named Distributed
 * Group Ratio Round-Robin (DGR3), is based on the following papers :
 *  - "Group Ratio Round-Robin: O(1) Proportional Share Scheduling for
 *    Uniprocessor and Multiprocessor Systems" by Bogdan Caprita, Wong Chun
 *    Chan, Jason Nieh, Clifford Stein and Haoqiang Zheng.
 *  - "Efficient and Scalable Multiprocessor Fair Scheduling Using Distributed
 *    Weighted Round-Robin" by Tong Li, Dan Baumberger and Scott Hahn.
 *
 * Note that the Group Ratio Round-Robin (GR3) paper offers a multiprocessor
 * extension, but based on a single global queue, which strongly limits its
 * scalability on systems with many processors. That extension isn't used in
 * this implementation.
 *
 * The basic idea is to use GR3 for processor-local scheduling, and Distributed
 * Weighted Round-Robin (DWRR) for inter-processor load balancing. These
 * algorithms were chosen for several reasons. To begin with, they provide
 * fair scheduling, a very desirable property for a modern scheduler. Next,
 * being based on round-robin, their algorithmic complexity is very low (GR3
 * has O(1) scheduling complexity, and O(g) complexity on thread addition
 * or removal, g being the number of groups, with one group per priority, a
 * low number in practice). Finally, they're very simple to implement, making
 * them easy to adjust and maintain.
 *
 * Both algorithms are actually slightly modified for efficiency. First, this
 * version of GR3 is simplified by mapping exactly one group to one priority,
 * and in turn, one weight. This is possible because priorities are intended
 * to match Unix nice values, and systems commonly provide a constant, small
 * set of nice values. This removes the need for deficit accounting. Next,
 * round tracking is used to improve the handling of dynamic events : work
 * scaling is performed only on thread addition, and not when a thread that
 * was removed is added again during the same round. In addition, since GR3
 * is itself a round-robin algorithm, it already provides the feature required
 * from local scheduling by DWRR, namely round slicing. Consequently, DWRR
 * doesn't sit "on top" of GR3, but is actually merged with it. The result is
 * an algorithm that shares the same data for both local scheduling and load
 * balancing.
 *
 * A few terms are used by both papers with slightly different meanings. Here
 * are the definitions used in this implementation :
 *  - The time unit is the system timer period (1 / HZ)
 *  - Work is the amount of execution time units consumed
 *  - Weight is the amount of execution time units allocated
 *  - A round is the shortest period during which all threads in a run queue
 *    consume their allocated time (i.e. their work reaches their weight)
 *
 * TODO Sub-tick accounting.
 *
 * TODO CPU affinity.
* * TODO Take into account the underlying CPU topology (and adjust load * balancing to access the global highest round less frequently on large * processor groups, perhaps by applying the load balancing algorithm in a * bottom-up fashion with one highest round per processor group). * * TODO For now, interactivity can not be experimented. The current strategy * is to always add threads in front of their group queue and track rounds * so that they don't get more time than they should. A direct consequence * is that continually spawning threads at short intervals is likely to cause * starvation. This could be fixed by adding newly created threads at the back * of their group queue. For now, don't overengineer, and wait until all this * can actually be tested. * * TODO Review weight computation (it may be more appropriate to determine * weights in a smoother way than a raw scaling). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Default time slice for real-time round-robin scheduling. */ #define THREAD_DEFAULT_RR_TIME_SLICE (HZ / 10) /* * Maximum number of threads which can be pulled from a remote run queue * while interrupts are disabled. */ #define THREAD_MAX_MIGRATIONS 16 /* * Delay (in ticks) between two balance attempts when a run queue is idle. */ #define THREAD_IDLE_BALANCE_TICKS (HZ / 2) /* * Run queue properties for real-time threads. */ struct thread_rt_runq { unsigned int bitmap; struct list threads[THREAD_SCHED_RT_PRIO_MAX + 1]; }; /* * Initial value of the highest round. * * Set to a high value to make sure overflows are correctly handled. */ #define THREAD_TS_INITIAL_ROUND ((unsigned long)-10) /* * Round slice base unit for time-sharing threads. */ #define THREAD_TS_ROUND_SLICE_BASE (HZ / 10) /* * Group of threads sharing the same weight. */ struct thread_ts_group { struct list node; struct list threads; unsigned int weight; unsigned int work; }; /* * Run queue properties for time-sharing threads. * * The current group pointer has a valid address only when the run queue isn't * empty. */ struct thread_ts_runq { struct thread_ts_group group_array[THREAD_SCHED_TS_PRIO_MAX + 1]; struct list groups; struct list threads; struct thread_ts_group *current; unsigned int nr_threads; unsigned int weight; unsigned int work; }; /* * Per processor run queue. * * Locking multiple run queues is done in the ascending order of their * addresses. Interrupts must be disabled whenever locking a run queue, even * a remote one, otherwise an interrupt (which invokes the scheduler on its * return path) may violate the locking order. */ struct thread_runq { struct spinlock lock; struct thread *current; unsigned int nr_threads; /* Real-time related members */ struct thread_rt_runq rt_runq; /* * Time-sharing related members. * * The current round is set when the active run queue becomes non-empty. * It's not reset when both run queues become empty. As a result, the * current round has a meaningful value only when at least one thread is * present, i.e. the global weight isn't zero. 
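 *
 * As a rough numeric illustration of the weights involved (assuming HZ is
 * 100, so that THREAD_TS_ROUND_SLICE_BASE is 10 ticks; the actual values
 * depend on the configured timer frequency) : a time-sharing thread of
 * priority 1 gets a weight of (1 + 1) * 10 = 20 ticks per round, and one of
 * priority 3 gets (3 + 1) * 10 = 40 ticks. With only these two threads on
 * the queue, the global weight is 60, a full round lasts 60 ticks, and CPU
 * time is shared in a 1:2 ratio, which is the proportional share behaviour
 * GR3 aims for.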
*/ unsigned long ts_round; unsigned int ts_weight; struct thread_ts_runq ts_runqs[2]; struct thread_ts_runq *ts_runq_active; struct thread_ts_runq *ts_runq_expired; struct thread *balancer; struct thread *idler; /* Ticks before the next balancing attempt when a run queue is idle */ unsigned int idle_balance_ticks; } __aligned(CPU_L1_SIZE); /* * Operations of a scheduling class. */ struct thread_sched_ops { void (*init_thread)(struct thread *thread, unsigned short priority); struct thread_runq * (*select_runq)(void); void (*add)(struct thread_runq *runq, struct thread *thread); void (*remove)(struct thread_runq *runq, struct thread *thread); void (*put_prev)(struct thread_runq *runq, struct thread *thread); struct thread * (*get_next)(struct thread_runq *runq); void (*tick)(struct thread_runq *runq, struct thread *thread); }; static struct thread_runq thread_runqs[MAX_CPUS]; /* * Statically allocated fake threads that provide thread context to processors * during bootstrap. */ static struct thread thread_booters[MAX_CPUS] __initdata; /* * Caches for allocated threads and their stacks. */ static struct kmem_cache thread_cache; static struct kmem_cache thread_stack_cache; /* * Table used to quickly map policies to classes. */ static unsigned char thread_policy_table[THREAD_NR_SCHED_POLICIES]; /* * Scheduling class operations. */ static struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES]; static struct thread_attr thread_default_attr = { NULL, NULL, THREAD_SCHED_POLICY_TS, THREAD_SCHED_TS_PRIO_DEFAULT }; static BITMAP_DECLARE(thread_active_runqs, MAX_CPUS); /* * System-wide value of the current highest round. * * This global variable is accessed without any synchronization. Its value * being slightly inaccurate doesn't harm the fairness properties of the * scheduling and load balancing algorithms. * * There can be moderate bouncing on this word so give it its own cache line. */ static struct { volatile unsigned long value __aligned(CPU_L1_SIZE); } thread_ts_highest_round_struct; #define thread_ts_highest_round (thread_ts_highest_round_struct.value) /* * List of threads pending for destruction by the reaper. 
*/ static struct mutex thread_reap_lock; static struct condition thread_reap_condition; static struct list thread_reap_list; struct thread_reap_waiter { struct list node; struct thread *thread; }; static void __init thread_runq_init_rt(struct thread_runq *runq) { struct thread_rt_runq *rt_runq; size_t i; rt_runq = &runq->rt_runq; rt_runq->bitmap = 0; for (i = 0; i < ARRAY_SIZE(rt_runq->threads); i++) list_init(&rt_runq->threads[i]); } static void __init thread_ts_group_init(struct thread_ts_group *group) { list_init(&group->threads); group->weight = 0; group->work = 0; } static void __init thread_ts_runq_init(struct thread_ts_runq *ts_runq) { size_t i; for (i = 0; i < ARRAY_SIZE(ts_runq->group_array); i++) thread_ts_group_init(&ts_runq->group_array[i]); list_init(&ts_runq->groups); list_init(&ts_runq->threads); ts_runq->nr_threads = 0; ts_runq->weight = 0; ts_runq->work = 0; } static void __init thread_runq_init_ts(struct thread_runq *runq) { runq->ts_weight = 0; runq->ts_runq_active = &runq->ts_runqs[0]; runq->ts_runq_expired = &runq->ts_runqs[1]; thread_ts_runq_init(runq->ts_runq_active); thread_ts_runq_init(runq->ts_runq_expired); } static void __init thread_runq_init(struct thread_runq *runq, struct thread *booter) { spinlock_init(&runq->lock); runq->current = booter; runq->nr_threads = 0; thread_runq_init_rt(runq); thread_runq_init_ts(runq); runq->balancer = NULL; runq->idler = NULL; runq->idle_balance_ticks = (unsigned int)-1; } static inline unsigned int thread_runq_id(struct thread_runq *runq) { return runq - thread_runqs; } static inline struct thread_runq * thread_runq_local(void) { assert(!thread_preempt_enabled() || thread_pinned()); return &thread_runqs[cpu_id()]; } static inline void thread_set_flag(struct thread *thread, unsigned long flag) { atomic_or(&thread->flags, flag); } static inline void thread_clear_flag(struct thread *thread, unsigned long flag) { atomic_and(&thread->flags, ~flag); } static inline int thread_test_flag(struct thread *thread, unsigned long flag) { barrier(); return ((thread->flags & flag) != 0); } static void thread_runq_add(struct thread_runq *runq, struct thread *thread) { assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); thread_sched_ops[thread->sched_class].add(runq, thread); if (runq->nr_threads == 0) bitmap_set_atomic(thread_active_runqs, thread_runq_id(runq)); runq->nr_threads++; if (thread->sched_class < runq->current->sched_class) thread_set_flag(runq->current, THREAD_RESCHEDULE); thread->runq = runq; } static void thread_runq_remove(struct thread_runq *runq, struct thread *thread) { assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); runq->nr_threads--; if (runq->nr_threads == 0) bitmap_clear_atomic(thread_active_runqs, thread_runq_id(runq)); thread_sched_ops[thread->sched_class].remove(runq, thread); } static void thread_runq_put_prev(struct thread_runq *runq, struct thread *thread) { assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); thread_sched_ops[thread->sched_class].put_prev(runq, thread); } static struct thread * thread_runq_get_next(struct thread_runq *runq) { struct thread *thread; unsigned int i; assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); for (i = 0; i < ARRAY_SIZE(thread_sched_ops); i++) { thread = thread_sched_ops[i].get_next(runq); if (thread != NULL) { runq->current = thread; return thread; } } /* The idle class should never be empty */ panic("thread: unable to find next thread"); } static void thread_runq_wakeup(struct thread_runq *runq, struct thread 
*thread) { assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); assert(thread->state == THREAD_RUNNING); thread_runq_add(runq, thread); if ((runq != thread_runq_local()) && thread_test_flag(runq->current, THREAD_RESCHEDULE)) { /* * Make the new flags globally visible before sending the * rescheduling request. This barrier pairs with the one implied * by the rescheduling IPI. */ mb_store(); tcb_send_reschedule(thread_runq_id(runq)); } } static void thread_runq_wakeup_balancer(struct thread_runq *runq) { if (runq->balancer->state == THREAD_RUNNING) return; runq->balancer->state = THREAD_RUNNING; thread_runq_wakeup(runq, runq->balancer); } static struct thread_runq * thread_runq_schedule(struct thread_runq *runq, struct thread *prev) { struct thread *next; assert(prev->preempt == 2); assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); llsync_checkin(thread_runq_id(runq)); thread_clear_flag(prev, THREAD_RESCHEDULE); thread_runq_put_prev(runq, prev); if (prev->state != THREAD_RUNNING) { thread_runq_remove(runq, prev); if ((runq->nr_threads == 0) && (prev != runq->balancer)) thread_runq_wakeup_balancer(runq); } next = thread_runq_get_next(runq); assert((next != runq->idler) || (runq->nr_threads == 0)); if (prev != next) { if ((prev->task != next->task) && (next->task != kernel_task)) pmap_load(next->task->map->pmap); /* * That's where the true context switch occurs. The next thread must * unlock the run queue and reenable preemption. Note that unlocking * and locking the run queue again is equivalent to a full memory * barrier. */ tcb_switch(&prev->tcb, &next->tcb); /* * When dispatched again, the thread might have been moved to another * processor. */ runq = thread_runq_local(); } return runq; } static void thread_runq_double_lock(struct thread_runq *a, struct thread_runq *b) { assert(!cpu_intr_enabled()); assert(!thread_preempt_enabled()); assert(a != b); if (a < b) { spinlock_lock(&a->lock); spinlock_lock(&b->lock); } else { spinlock_lock(&b->lock); spinlock_lock(&a->lock); } } static void thread_sched_rt_init_thread(struct thread *thread, unsigned short priority) { assert(priority <= THREAD_SCHED_RT_PRIO_MAX); thread->rt_ctx.priority = priority; thread->rt_ctx.time_slice = THREAD_DEFAULT_RR_TIME_SLICE; } static struct thread_runq * thread_sched_rt_select_runq(void) { struct thread_runq *runq; runq = thread_runq_local(); spinlock_lock(&runq->lock); return runq; } static void thread_sched_rt_add(struct thread_runq *runq, struct thread *thread) { struct thread_rt_runq *rt_runq; struct list *threads; rt_runq = &runq->rt_runq; threads = &rt_runq->threads[thread->rt_ctx.priority]; list_insert_tail(threads, &thread->rt_ctx.node); if (list_singular(threads)) rt_runq->bitmap |= (1U << thread->rt_ctx.priority); if ((thread->sched_class == runq->current->sched_class) && (thread->rt_ctx.priority > runq->current->rt_ctx.priority)) thread_set_flag(runq->current, THREAD_RESCHEDULE); } static void thread_sched_rt_remove(struct thread_runq *runq, struct thread *thread) { struct thread_rt_runq *rt_runq; struct list *threads; rt_runq = &runq->rt_runq; threads = &rt_runq->threads[thread->rt_ctx.priority]; list_remove(&thread->rt_ctx.node); if (list_empty(threads)) rt_runq->bitmap &= ~(1U << thread->rt_ctx.priority); } static void thread_sched_rt_put_prev(struct thread_runq *runq, struct thread *thread) { thread_sched_rt_add(runq, thread); } static struct thread * thread_sched_rt_get_next(struct thread_runq *runq) { struct thread_rt_runq *rt_runq; struct thread *thread; struct list 
*threads; unsigned int priority; rt_runq = &runq->rt_runq; if (rt_runq->bitmap == 0) return NULL; priority = THREAD_SCHED_RT_PRIO_MAX - __builtin_clz(rt_runq->bitmap); threads = &rt_runq->threads[priority]; assert(!list_empty(threads)); thread = list_first_entry(threads, struct thread, rt_ctx.node); thread_sched_rt_remove(runq, thread); return thread; } static void thread_sched_rt_tick(struct thread_runq *runq, struct thread *thread) { (void)runq; if (thread->sched_policy != THREAD_SCHED_POLICY_RR) return; thread->rt_ctx.time_slice--; if (thread->rt_ctx.time_slice > 0) return; thread->rt_ctx.time_slice = THREAD_DEFAULT_RR_TIME_SLICE; thread_set_flag(thread, THREAD_RESCHEDULE); } static inline unsigned short thread_sched_ts_prio2weight(unsigned short priority) { return ((priority + 1) * THREAD_TS_ROUND_SLICE_BASE); } static void thread_sched_ts_init_thread(struct thread *thread, unsigned short priority) { assert(priority <= THREAD_SCHED_TS_PRIO_MAX); thread->ts_ctx.ts_runq = NULL; thread->ts_ctx.round = 0; thread->ts_ctx.priority = priority; thread->ts_ctx.weight = thread_sched_ts_prio2weight(priority); thread->ts_ctx.work = 0; } static struct thread_runq * thread_sched_ts_select_runq(void) { struct thread_runq *runq, *tmp; int i, nr_runqs; long delta; nr_runqs = cpu_count(); bitmap_for_each_zero(thread_active_runqs, nr_runqs, i) { runq = &thread_runqs[i]; spinlock_lock(&runq->lock); /* The run queue really is idle, return it */ if (runq->current == runq->idler) goto out; spinlock_unlock(&runq->lock); } runq = &thread_runqs[0]; spinlock_lock(&runq->lock); for (i = 1; i < nr_runqs; i++) { tmp = &thread_runqs[i]; spinlock_lock(&tmp->lock); /* A run queue may have become idle */ if (tmp->current == tmp->idler) { spinlock_unlock(&runq->lock); runq = tmp; goto out; } /* * The run queue isn't idle, but there are no time-sharing thread, * which means there are real-time threads. */ if (tmp->ts_weight == 0) { spinlock_unlock(&tmp->lock); continue; } delta = (long)(tmp->ts_round - runq->ts_round); /* Look for the least loaded of the run queues in the highest round */ if ((delta > 0) || ((delta == 0) && (tmp->ts_weight < runq->ts_weight))) { spinlock_unlock(&runq->lock); runq = tmp; continue; } spinlock_unlock(&tmp->lock); } out: return runq; } static unsigned int thread_sched_ts_enqueue_scale(unsigned int work, unsigned int old_weight, unsigned int new_weight) { assert(old_weight != 0); #ifndef __LP64__ if (likely((work < 0x10000) && (new_weight < 0x10000))) return (work * new_weight) / old_weight; #endif /* __LP64__ */ return (unsigned int)(((unsigned long long)work * new_weight) / old_weight); } static void thread_sched_ts_enqueue(struct thread_ts_runq *ts_runq, unsigned long round, struct thread *thread) { struct thread_ts_group *group, *tmp; struct list *node, *init_node; unsigned int group_weight, total_weight; assert(thread->ts_ctx.ts_runq == NULL); group = &ts_runq->group_array[thread->ts_ctx.priority]; group_weight = group->weight + thread->ts_ctx.weight; total_weight = ts_runq->weight + thread->ts_ctx.weight; node = (group->weight == 0) ? 
list_last(&ts_runq->groups) : list_prev(&group->node); init_node = node; while (!list_end(&ts_runq->groups, node)) { tmp = list_entry(node, struct thread_ts_group, node); if (tmp->weight >= group_weight) break; node = list_prev(node); } if (group->weight == 0) list_insert_after(node, &group->node); else if (node != init_node) { list_remove(&group->node); list_insert_after(node, &group->node); } /* * XXX Unfairness can occur if the run queue round wraps around and the * thread is "lucky" enough to have the same round value. This should be * rare and harmless otherwise. */ if (thread->ts_ctx.round == round) { ts_runq->work += thread->ts_ctx.work; group->work += thread->ts_ctx.work; } else { unsigned int group_work, thread_work; if (ts_runq->weight == 0) thread_work = 0; else { group_work = (group->weight == 0) ? thread_sched_ts_enqueue_scale(ts_runq->work, ts_runq->weight, thread->ts_ctx.weight) : thread_sched_ts_enqueue_scale(group->work, group->weight, group_weight); thread_work = group_work - group->work; ts_runq->work += thread_work; group->work = group_work; } thread->ts_ctx.round = round; thread->ts_ctx.work = thread_work; } ts_runq->nr_threads++; ts_runq->weight = total_weight; group->weight = group_weight; /* Insert at the front of the group to improve interactivity */ list_insert(&group->threads, &thread->ts_ctx.group_node); list_insert_tail(&ts_runq->threads, &thread->ts_ctx.runq_node); thread->ts_ctx.ts_runq = ts_runq; } static void thread_sched_ts_restart(struct thread_runq *runq) { struct thread_ts_runq *ts_runq; struct list *node; ts_runq = runq->ts_runq_active; node = list_first(&ts_runq->groups); assert(node != NULL); ts_runq->current = list_entry(node, struct thread_ts_group, node); if (runq->current->sched_class == THREAD_SCHED_CLASS_TS) thread_set_flag(runq->current, THREAD_RESCHEDULE); } static void thread_sched_ts_add(struct thread_runq *runq, struct thread *thread) { unsigned int total_weight; if (runq->ts_weight == 0) runq->ts_round = thread_ts_highest_round; total_weight = runq->ts_weight + thread->ts_ctx.weight; /* TODO Limit the maximum number of threads to prevent this situation */ if (total_weight < runq->ts_weight) panic("thread: weight overflow"); runq->ts_weight = total_weight; thread_sched_ts_enqueue(runq->ts_runq_active, runq->ts_round, thread); thread_sched_ts_restart(runq); } static void thread_sched_ts_dequeue(struct thread *thread) { struct thread_ts_runq *ts_runq; struct thread_ts_group *group, *tmp; struct list *node, *init_node; assert(thread->ts_ctx.ts_runq != NULL); ts_runq = thread->ts_ctx.ts_runq; group = &ts_runq->group_array[thread->ts_ctx.priority]; thread->ts_ctx.ts_runq = NULL; list_remove(&thread->ts_ctx.runq_node); list_remove(&thread->ts_ctx.group_node); ts_runq->work -= thread->ts_ctx.work; group->work -= thread->ts_ctx.work; ts_runq->weight -= thread->ts_ctx.weight; group->weight -= thread->ts_ctx.weight; ts_runq->nr_threads--; if (group->weight == 0) list_remove(&group->node); else { node = list_next(&group->node); init_node = node; while (!list_end(&ts_runq->groups, node)) { tmp = list_entry(node, struct thread_ts_group, node); if (tmp->weight <= group->weight) break; node = list_next(node); } if (node != init_node) { list_remove(&group->node); list_insert_before(node, &group->node); } } } static void thread_sched_ts_remove(struct thread_runq *runq, struct thread *thread) { struct thread_ts_runq *ts_runq; runq->ts_weight -= thread->ts_ctx.weight; ts_runq = thread->ts_ctx.ts_runq; thread_sched_ts_dequeue(thread); if (ts_runq == 
runq->ts_runq_active) { if (ts_runq->nr_threads == 0) thread_runq_wakeup_balancer(runq); else thread_sched_ts_restart(runq); } } static void thread_sched_ts_deactivate(struct thread_runq *runq, struct thread *thread) { assert(thread->ts_ctx.ts_runq == runq->ts_runq_active); assert(thread->ts_ctx.round == runq->ts_round); thread_sched_ts_dequeue(thread); thread->ts_ctx.round++; thread->ts_ctx.work -= thread->ts_ctx.weight; thread_sched_ts_enqueue(runq->ts_runq_expired, runq->ts_round + 1, thread); if (runq->ts_runq_active->nr_threads == 0) thread_runq_wakeup_balancer(runq); } static void thread_sched_ts_put_prev(struct thread_runq *runq, struct thread *thread) { struct thread_ts_runq *ts_runq; struct thread_ts_group *group; ts_runq = runq->ts_runq_active; group = &ts_runq->group_array[thread->ts_ctx.priority]; list_insert_tail(&group->threads, &thread->ts_ctx.group_node); if (thread->ts_ctx.work >= thread->ts_ctx.weight) thread_sched_ts_deactivate(runq, thread); } static int thread_sched_ts_ratio_exceeded(struct thread_ts_group *current, struct thread_ts_group *next) { unsigned long long a, b; #ifndef __LP64__ unsigned int ia, ib; if (likely((current->weight < 0x10000) && (next->weight < 0x10000))) { ia = (current->work + 1) * next->weight; ib = (next->work + 1) * current->weight; return ia > ib; } #endif /* __LP64__ */ a = ((unsigned long long)current->work + 1) * next->weight; b = ((unsigned long long)next->work + 1) * current->weight; return a > b; } static struct thread * thread_sched_ts_get_next(struct thread_runq *runq) { struct thread_ts_runq *ts_runq; struct thread_ts_group *group, *next; struct thread *thread; struct list *node; ts_runq = runq->ts_runq_active; if (ts_runq->nr_threads == 0) return NULL; group = ts_runq->current; node = list_next(&group->node); if (list_end(&ts_runq->groups, node)) { node = list_first(&ts_runq->groups); group = list_entry(node, struct thread_ts_group, node); } else { next = list_entry(node, struct thread_ts_group, node); if (thread_sched_ts_ratio_exceeded(group, next)) group = next; else { node = list_first(&ts_runq->groups); group = list_entry(node, struct thread_ts_group, node); } } ts_runq->current = group; node = list_first(&group->threads); thread = list_entry(node, struct thread, ts_ctx.group_node); list_remove(node); return thread; } static void thread_sched_ts_tick(struct thread_runq *runq, struct thread *thread) { struct thread_ts_runq *ts_runq; struct thread_ts_group *group; ts_runq = runq->ts_runq_active; ts_runq->work++; group = &ts_runq->group_array[thread->ts_ctx.priority]; group->work++; thread_set_flag(thread, THREAD_RESCHEDULE); thread->ts_ctx.work++; } static void thread_sched_ts_start_next_round(struct thread_runq *runq) { struct thread_ts_runq *tmp; long delta; tmp = runq->ts_runq_expired; runq->ts_runq_expired = runq->ts_runq_active; runq->ts_runq_active = tmp; if (runq->ts_runq_active->nr_threads != 0) { runq->ts_round++; delta = (long)(runq->ts_round - thread_ts_highest_round); if (delta > 0) thread_ts_highest_round = runq->ts_round; thread_sched_ts_restart(runq); } } /* * Check that a remote run queue satisfies the minimum migration requirements. 
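 *
 * Namely, as implemented below, the remote run queue must have a non-zero
 * time-sharing weight, its round must be the highest round or the one right
 * before it, and it must hold at least one time-sharing thread (counting
 * both the active and expired queues) that isn't currently running, since a
 * queue whose only time-sharing thread is the current one has nothing to
 * give away.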
*/ static int thread_sched_ts_balance_eligible(struct thread_runq *runq, unsigned long highest_round) { unsigned int nr_threads; if (runq->ts_weight == 0) return 0; if ((runq->ts_round != highest_round) && (runq->ts_round != (highest_round - 1))) return 0; nr_threads = runq->ts_runq_active->nr_threads + runq->ts_runq_expired->nr_threads; if ((nr_threads == 0) || ((nr_threads == 1) && (runq->current->sched_class == THREAD_SCHED_CLASS_TS))) return 0; return 1; } /* * Try to find the most suitable run queue from which to pull threads. */ static struct thread_runq * thread_sched_ts_balance_scan(struct thread_runq *runq, unsigned long highest_round) { struct thread_runq *remote_runq, *tmp; unsigned long flags; int i, nr_runqs; nr_runqs = cpu_count(); remote_runq = NULL; thread_preempt_disable(); flags = cpu_intr_save(); bitmap_for_each(thread_active_runqs, nr_runqs, i) { tmp = &thread_runqs[i]; if (tmp == runq) continue; spinlock_lock(&tmp->lock); if (!thread_sched_ts_balance_eligible(tmp, highest_round)) { spinlock_unlock(&tmp->lock); continue; } if (remote_runq == NULL) { remote_runq = tmp; continue; } if (tmp->ts_weight > remote_runq->ts_weight) { spinlock_unlock(&remote_runq->lock); remote_runq = tmp; continue; } spinlock_unlock(&tmp->lock); } if (remote_runq != NULL) spinlock_unlock(&remote_runq->lock); cpu_intr_restore(flags); thread_preempt_enable(); return remote_runq; } static unsigned int thread_sched_ts_balance_pull(struct thread_runq *runq, struct thread_runq *remote_runq, struct thread_ts_runq *ts_runq, unsigned int nr_pulls) { struct thread *thread, *tmp; list_for_each_entry_safe(&ts_runq->threads, thread, tmp, ts_ctx.runq_node) { if (thread == remote_runq->current) continue; /* * The pinned counter is changed without explicit synchronization. * However, it can only be changed by its owning thread. As threads * currently running aren't considered for migration, the thread had * to be preempted and invoke the scheduler. Since balancer threads * acquire the run queue lock, there is strong ordering between * changing the pinned counter and setting the current thread of a * run queue. */ if (thread->pinned) continue; /* * Make sure at least one thread is pulled if possible. If one or more * thread has already been pulled, take weights into account. */ if ((nr_pulls != 0) && ((runq->ts_weight + thread->ts_ctx.weight) > (remote_runq->ts_weight - thread->ts_ctx.weight))) break; thread_runq_remove(remote_runq, thread); /* Don't discard the work already accounted for */ thread->ts_ctx.round = runq->ts_round; thread_runq_add(runq, thread); nr_pulls++; if (nr_pulls == THREAD_MAX_MIGRATIONS) break; } return nr_pulls; } static unsigned int thread_sched_ts_balance_migrate(struct thread_runq *runq, struct thread_runq *remote_runq, unsigned long highest_round) { unsigned int nr_pulls; nr_pulls = 0; if (!thread_sched_ts_balance_eligible(remote_runq, highest_round)) goto out; nr_pulls = thread_sched_ts_balance_pull(runq, remote_runq, remote_runq->ts_runq_active, 0); if (nr_pulls == THREAD_MAX_MIGRATIONS) goto out; /* * Threads in the expired queue of a processor in round highest are * actually in round highest + 1. */ if (remote_runq->ts_round != highest_round) nr_pulls = thread_sched_ts_balance_pull(runq, remote_runq, remote_runq->ts_runq_expired, nr_pulls); out: return nr_pulls; } /* * Inter-processor load balancing for time-sharing threads. * * Preemption must be disabled, and the local run queue must be locked when * calling this function. 
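 *
 * The procedure implemented below roughly is : snapshot the global highest
 * round, scan the active run queues for the most loaded eligible remote
 * queue, lock both queues in address order, and pull up to
 * THREAD_MAX_MIGRATIONS threads from its active (and possibly expired)
 * queue. If that fails, a simpler pass over every run queue is attempted,
 * and if still nothing could be pulled, the local queues are switched to
 * start the next round.
 *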
If balancing actually occurs, the lock will be * released and preemption enabled when needed. */ static void thread_sched_ts_balance(struct thread_runq *runq, unsigned long *flags) { struct thread_runq *remote_runq; unsigned long highest_round; unsigned int nr_migrations; int i, nr_runqs; /* * Grab the highest round now and only use the copy so the value is stable * during the balancing operation. */ highest_round = thread_ts_highest_round; if ((runq->ts_round != highest_round) && (runq->ts_runq_expired->nr_threads != 0)) goto no_migration; spinlock_unlock_intr_restore(&runq->lock, *flags); thread_preempt_enable(); remote_runq = thread_sched_ts_balance_scan(runq, highest_round); if (remote_runq != NULL) { thread_preempt_disable(); *flags = cpu_intr_save(); thread_runq_double_lock(runq, remote_runq); nr_migrations = thread_sched_ts_balance_migrate(runq, remote_runq, highest_round); spinlock_unlock(&remote_runq->lock); if (nr_migrations != 0) return; spinlock_unlock_intr_restore(&runq->lock, *flags); thread_preempt_enable(); } /* * The scan or the migration failed. As a fallback, make another, simpler * pass on every run queue, and stop as soon as at least one thread could * be successfully pulled. */ for (i = 0, nr_runqs = cpu_count(); i < nr_runqs; i++) { remote_runq = &thread_runqs[i]; if (remote_runq == runq) continue; thread_preempt_disable(); *flags = cpu_intr_save(); thread_runq_double_lock(runq, remote_runq); nr_migrations = thread_sched_ts_balance_migrate(runq, remote_runq, highest_round); spinlock_unlock(&remote_runq->lock); if (nr_migrations != 0) return; spinlock_unlock_intr_restore(&runq->lock, *flags); thread_preempt_enable(); } thread_preempt_disable(); spinlock_lock_intr_save(&runq->lock, flags); no_migration: /* * No thread could be migrated. Check the active run queue, as another * processor might have added threads while the balancer was running. * If the run queue is still empty, switch to the next round. The run * queue lock must remain held until the next scheduling decision to * prevent a remote balancer thread from stealing active threads. 
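 *
 * Switching rounds may in turn advance the global highest round. Because
 * rounds are free-running unsigned counters, comparisons are made on the
 * signed difference, e.g. (long)(runq->ts_round - thread_ts_highest_round),
 * so that wrapping around doesn't confuse the algorithm, which is also why
 * THREAD_TS_INITIAL_ROUND starts near the wrap-around point : overflow
 * handling gets exercised early.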
*/ if (runq->ts_runq_active->nr_threads == 0) thread_sched_ts_start_next_round(runq); } static void thread_sched_idle_init_thread(struct thread *thread, unsigned short priority) { (void)thread; (void)priority; } static struct thread_runq * thread_sched_idle_select_runq(void) { panic("thread: idler threads cannot be awaken"); } static void __noreturn thread_sched_idle_panic(void) { panic("thread: only idle threads are allowed in the idle class"); } static void thread_sched_idle_add(struct thread_runq *runq, struct thread *thread) { (void)runq; (void)thread; thread_sched_idle_panic(); } static void thread_sched_idle_remove(struct thread_runq *runq, struct thread *thread) { (void)runq; (void)thread; thread_sched_idle_panic(); } static void thread_sched_idle_put_prev(struct thread_runq *runq, struct thread *thread) { (void)runq; (void)thread; } static struct thread * thread_sched_idle_get_next(struct thread_runq *runq) { return runq->idler; } static void thread_sched_idle_tick(struct thread_runq *runq, struct thread *thread) { (void)runq; (void)thread; } static void __init thread_bootstrap_common(unsigned int cpu) { struct thread *booter; booter = &thread_booters[cpu]; /* Initialize only what's needed during bootstrap */ booter->flags = 0; booter->preempt = 1; booter->sched_class = THREAD_SCHED_CLASS_IDLE; booter->task = kernel_task; thread_runq_init(&thread_runqs[cpu], booter); tcb_set_current(&booter->tcb); } void __init thread_bootstrap(void) { struct thread_sched_ops *ops; thread_policy_table[THREAD_SCHED_POLICY_FIFO] = THREAD_SCHED_CLASS_RT; thread_policy_table[THREAD_SCHED_POLICY_RR] = THREAD_SCHED_CLASS_RT; thread_policy_table[THREAD_SCHED_POLICY_TS] = THREAD_SCHED_CLASS_TS; thread_policy_table[THREAD_SCHED_POLICY_IDLE] = THREAD_SCHED_CLASS_IDLE; ops = &thread_sched_ops[THREAD_SCHED_CLASS_RT]; ops->init_thread = thread_sched_rt_init_thread; ops->select_runq = thread_sched_rt_select_runq; ops->add = thread_sched_rt_add; ops->remove = thread_sched_rt_remove; ops->put_prev = thread_sched_rt_put_prev; ops->get_next = thread_sched_rt_get_next; ops->tick = thread_sched_rt_tick; ops = &thread_sched_ops[THREAD_SCHED_CLASS_TS]; ops->init_thread = thread_sched_ts_init_thread; ops->select_runq = thread_sched_ts_select_runq; ops->add = thread_sched_ts_add; ops->remove = thread_sched_ts_remove; ops->put_prev = thread_sched_ts_put_prev; ops->get_next = thread_sched_ts_get_next; ops->tick = thread_sched_ts_tick; ops = &thread_sched_ops[THREAD_SCHED_CLASS_IDLE]; ops->init_thread = thread_sched_idle_init_thread; ops->select_runq = thread_sched_idle_select_runq; ops->add = thread_sched_idle_add; ops->remove = thread_sched_idle_remove; ops->put_prev = thread_sched_idle_put_prev; ops->get_next = thread_sched_idle_get_next; ops->tick = thread_sched_idle_tick; bitmap_zero(thread_active_runqs, MAX_CPUS); thread_ts_highest_round = THREAD_TS_INITIAL_ROUND; thread_bootstrap_common(0); } void __init thread_ap_bootstrap(void) { thread_bootstrap_common(cpu_id()); } static void thread_main(void) { struct thread *thread; assert(!cpu_intr_enabled()); assert(!thread_preempt_enabled()); spinlock_unlock(&thread_runq_local()->lock); cpu_intr_enable(); thread_preempt_enable(); thread = thread_self(); thread->fn(thread->arg); thread_exit(); } static void thread_init_sched(struct thread *thread, unsigned short priority) { thread_sched_ops[thread->sched_class].init_thread(thread, priority); } static void thread_init(struct thread *thread, void *stack, const struct thread_attr *attr, void (*fn)(void *), void *arg) { const char 
*name; struct task *task; if (attr == NULL) attr = &thread_default_attr; task = (attr->task == NULL) ? thread_self()->task : attr->task; assert(task != NULL); name = (attr->name == NULL) ? task->name : attr->name; assert(name != NULL); assert(attr->policy < THREAD_NR_SCHED_POLICIES); /* * The expected interrupt, preemption and run queue lock state when * dispatching a thread is : * - interrupts disabled * - preemption disabled * - run queue locked * * Locking the run queue increases the preemption counter once more, * making its value 2. */ tcb_init(&thread->tcb, stack, thread_main); thread->flags = 0; thread->runq = NULL; thread->state = THREAD_SLEEPING; thread->preempt = 2; thread->pinned = 0; thread->sched_policy = attr->policy; thread->sched_class = thread_policy_table[attr->policy]; thread_init_sched(thread, attr->priority); thread->task = task; thread->stack = stack; strlcpy(thread->name, name, sizeof(thread->name)); thread->fn = fn; thread->arg = arg; task_add_thread(task, thread); } static struct thread_runq * thread_lock_runq(struct thread *thread, unsigned long *flags) { struct thread_runq *runq; assert(thread != thread_self()); for (;;) { runq = thread->runq; spinlock_lock_intr_save(&runq->lock, flags); if (runq == thread->runq) return runq; spinlock_unlock_intr_restore(&runq->lock, *flags); } } static void thread_unlock_runq(struct thread_runq *runq, unsigned long flags) { spinlock_unlock_intr_restore(&runq->lock, flags); } static void thread_destroy(struct thread *thread) { struct thread_runq *runq; unsigned long flags, state; do { runq = thread_lock_runq(thread, &flags); state = thread->state; thread_unlock_runq(runq, flags); } while (state != THREAD_DEAD); task_remove_thread(thread->task, thread); kmem_cache_free(&thread_stack_cache, thread->stack); kmem_cache_free(&thread_cache, thread); } static void thread_reap(void *arg) { struct thread_reap_waiter *tmp; struct list waiters; (void)arg; for (;;) { mutex_lock(&thread_reap_lock); while (list_empty(&thread_reap_list)) condition_wait(&thread_reap_condition, &thread_reap_lock); list_set_head(&waiters, &thread_reap_list); list_init(&thread_reap_list); mutex_unlock(&thread_reap_lock); while (!list_empty(&waiters)) { tmp = list_first_entry(&waiters, struct thread_reap_waiter, node); list_remove(&tmp->node); thread_destroy(tmp->thread); } } /* Never reached */ } static void __init thread_setup_reaper(void) { struct thread_attr attr; struct thread *thread; int error; mutex_init(&thread_reap_lock); condition_init(&thread_reap_condition); list_init(&thread_reap_list); attr.task = NULL; attr.name = "x15_thread_reap"; attr.policy = THREAD_SCHED_POLICY_TS; attr.priority = THREAD_SCHED_TS_PRIO_DEFAULT; error = thread_create(&thread, &attr, thread_reap, NULL); if (error) panic("thread: unable to create reaper thread"); } static void thread_balance_idle_tick(struct thread_runq *runq) { assert(runq->idle_balance_ticks != 0); /* * Interrupts can occur early, at a time the balancer thread hasn't been * created yet. 
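 *
 * Otherwise, this function simply counts ticks down from
 * THREAD_IDLE_BALANCE_TICKS (half a second worth of ticks) while the run
 * queue has no thread to run, and wakes the balancer up when the delay
 * expires, so that an idle processor periodically tries to pull work instead
 * of waiting for threads to be handed to it.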
*/ if (runq->balancer == NULL) return; runq->idle_balance_ticks--; if (runq->idle_balance_ticks == 0) thread_runq_wakeup_balancer(runq); } static void thread_balance(void *arg) { struct thread_runq *runq; struct thread *self; unsigned long flags; runq = arg; self = runq->balancer; assert(self == runq->balancer); thread_preempt_disable(); spinlock_lock_intr_save(&runq->lock, &flags); for (;;) { runq->idle_balance_ticks = THREAD_IDLE_BALANCE_TICKS; self->state = THREAD_SLEEPING; runq = thread_runq_schedule(runq, self); assert(runq == arg); /* * This function may temporarily enable preemption and release the * run queue lock, but on return, the lock must remain held until this * balancer thread sleeps. */ thread_sched_ts_balance(runq, &flags); } } static void __init thread_setup_balancer(struct thread_runq *runq) { char name[THREAD_NAME_SIZE]; struct thread_runq *local_runq; struct thread_attr attr; struct thread *balancer; int error; snprintf(name, sizeof(name), "x15_thread_balance/%u", thread_runq_id(runq)); attr.task = NULL; attr.name = name; attr.policy = THREAD_SCHED_POLICY_RR; attr.priority = THREAD_SCHED_RT_PRIO_MIN; error = thread_create(&balancer, &attr, thread_balance, runq); if (error) panic("thread: unable to create balancer thread"); runq->balancer = balancer; /* * XXX Real-time threads are currently dispatched on the creator's run * queue. * * TODO Implement processor affinity and remove this kludge. */ local_runq = thread_runq_local(); if (runq != local_runq) { unsigned long flags; spinlock_lock_intr_save(&local_runq->lock, &flags); thread_runq_remove(local_runq, balancer); spinlock_unlock_intr_restore(&local_runq->lock, flags); spinlock_lock_intr_save(&runq->lock, &flags); thread_runq_add(runq, balancer); spinlock_unlock_intr_restore(&runq->lock, flags); } } static void thread_idle(void *arg) { struct thread *self; unsigned int cpu; self = thread_self(); cpu = thread_runq_id(arg); for (;;) { thread_preempt_disable(); llsync_unregister_cpu(cpu); for (;;) { cpu_intr_disable(); if (thread_test_flag(self, THREAD_RESCHEDULE)) { cpu_intr_enable(); break; } cpu_idle(); } llsync_register_cpu(cpu); thread_preempt_enable(); } } static void __init thread_setup_idler(struct thread_runq *runq) { char name[THREAD_NAME_SIZE]; struct thread_attr attr; struct thread *idler; void *stack; idler = kmem_cache_alloc(&thread_cache); if (idler == NULL) panic("thread: unable to allocate idler thread"); stack = kmem_cache_alloc(&thread_stack_cache); if (stack == NULL) panic("thread: unable to allocate idler thread stack"); snprintf(name, sizeof(name), "x15_thread_idle/%u", thread_runq_id(runq)); attr.task = kernel_task; attr.name = name; attr.policy = THREAD_SCHED_POLICY_IDLE; thread_init(idler, stack, &attr, thread_idle, runq); /* An idler thread needs special tuning */ idler->state = THREAD_RUNNING; idler->runq = runq; runq->idler = idler; } static void __init thread_setup_runq(struct thread_runq *runq) { thread_setup_balancer(runq); thread_setup_idler(runq); } void __init thread_setup(void) { size_t i; kmem_cache_init(&thread_cache, "thread", sizeof(struct thread), CPU_L1_SIZE, NULL, NULL, NULL, 0); kmem_cache_init(&thread_stack_cache, "thread_stack", STACK_SIZE, DATA_ALIGN, NULL, NULL, NULL, 0); thread_setup_reaper(); for (i = 0; i < cpu_count(); i++) thread_setup_runq(&thread_runqs[i]); } int thread_create(struct thread **threadp, const struct thread_attr *attr, void (*fn)(void *), void *arg) { struct thread *thread; void *stack; int error; thread = kmem_cache_alloc(&thread_cache); if (thread == 
NULL) { error = ERROR_NOMEM; goto error_thread; } stack = kmem_cache_alloc(&thread_stack_cache); if (stack == NULL) { error = ERROR_NOMEM; goto error_stack; } thread_init(thread, stack, attr, fn, arg); thread_wakeup(thread); *threadp = thread; return 0; error_stack: kmem_cache_free(&thread_cache, thread); error_thread: return error; } void thread_exit(void) { struct thread_reap_waiter waiter; struct thread_runq *runq; struct thread *thread; unsigned long flags; thread = thread_self(); waiter.thread = thread; mutex_lock(&thread_reap_lock); list_insert_tail(&thread_reap_list, &waiter.node); condition_signal(&thread_reap_condition); /* * Disable preemption before releasing the mutex to make sure the current * thread becomes dead as soon as possible. This is important because the * reaper thread actively polls the thread state before destroying it. */ thread_preempt_disable(); mutex_unlock(&thread_reap_lock); runq = thread_runq_local(); spinlock_lock_intr_save(&runq->lock, &flags); thread->state = THREAD_DEAD; runq = thread_runq_schedule(runq, thread); panic("thread: dead thread running"); } void thread_sleep(struct spinlock *interlock) { struct thread_runq *runq; struct thread *thread; unsigned long flags; thread = thread_self(); thread_preempt_disable(); runq = thread_runq_local(); spinlock_lock_intr_save(&runq->lock, &flags); spinlock_unlock(interlock); thread->state = THREAD_SLEEPING; runq = thread_runq_schedule(runq, thread); assert(thread->state == THREAD_RUNNING); spinlock_unlock_intr_restore(&runq->lock, flags); thread_preempt_enable(); spinlock_lock(interlock); } void thread_wakeup(struct thread *thread) { struct thread_runq *runq; unsigned long flags; /* * There is at most one reference on threads that were never dispatched, * in which case there is no need to lock anything. */ if (thread->runq == NULL) { assert(thread->state != THREAD_RUNNING); thread->state = THREAD_RUNNING; } else { /* * If another wakeup was attempted right before this one, the thread * may currently be pushed on a remote run queue, and the run queue * being locked here is actually the previous one. The run queue * pointer may be modified concurrently, now being protected by the * target run queue. This isn't a problem since the thread state has * already been updated, making this attempt stop early. In addition, * locking semantics guarantee that, if the thread as seen by this * attempt isn't running, its run queue is up to date. 
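 *
 * As an illustration : another processor wakes the thread up and is about
 * to add it to a freshly selected run queue when this wakeup is attempted.
 * This attempt locks the run queue still recorded in the thread (possibly
 * the previous one), observes that the state was already set to
 * THREAD_RUNNING by the first wakeup, and returns early, leaving the first
 * wakeup solely responsible for dispatching the thread.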
*/ runq = thread_lock_runq(thread, &flags); if (thread->state == THREAD_RUNNING) { thread_unlock_runq(runq, flags); return; } thread->state = THREAD_RUNNING; thread_unlock_runq(runq, flags); } thread_preempt_disable(); flags = cpu_intr_save(); /* The returned run queue is locked */ runq = thread_sched_ops[thread->sched_class].select_runq(); thread_runq_wakeup(runq, thread); spinlock_unlock(&runq->lock); cpu_intr_restore(flags); thread_preempt_enable(); } void __init thread_run(void) { struct thread_runq *runq; struct thread *thread; assert(cpu_intr_enabled()); runq = thread_runq_local(); llsync_register_cpu(thread_runq_id(runq)); thread = thread_self(); assert(thread == runq->current); assert(thread->preempt == 1); cpu_intr_disable(); spinlock_lock(&runq->lock); thread = thread_runq_get_next(thread_runq_local()); if (thread->task != kernel_task) pmap_load(thread->task->map->pmap); tcb_load(&thread->tcb); } void thread_reschedule(void) { struct thread_runq *runq; struct thread *thread; unsigned long flags; thread = thread_self(); if (!thread_test_flag(thread, THREAD_RESCHEDULE) || !thread_preempt_enabled()) return; do { thread_preempt_disable(); runq = thread_runq_local(); spinlock_lock_intr_save(&runq->lock, &flags); runq = thread_runq_schedule(runq, thread); spinlock_unlock_intr_restore(&runq->lock, flags); thread_preempt_enable_no_resched(); } while (thread_test_flag(thread, THREAD_RESCHEDULE)); } void thread_tick(void) { struct thread_runq *runq; struct thread *thread; assert(!cpu_intr_enabled()); assert(!thread_preempt_enabled()); runq = thread_runq_local(); llsync_commit_checkpoint(thread_runq_id(runq)); thread = thread_self(); spinlock_lock(&runq->lock); if (runq->nr_threads == 0) thread_balance_idle_tick(runq); thread_sched_ops[thread->sched_class].tick(runq, thread); spinlock_unlock(&runq->lock); }
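/*
 * Informal usage sketch (kept as a comment, not compiled in) : creating and
 * starting a kernel thread with the interface above, along the lines of what
 * thread_setup_reaper() does. The worker function, its body and the thread
 * name are only examples.
 *
 *  static void my_worker(void *arg)
 *  {
 *      (void)arg;
 *
 *      for (;;)
 *          do_work();                          // hypothetical workload
 *  }
 *
 *  struct thread_attr attr;
 *  struct thread *thread;
 *  int error;
 *
 *  attr.task = NULL;                           // inherit the creator's task
 *  attr.name = "x15_my_worker";
 *  attr.policy = THREAD_SCHED_POLICY_TS;
 *  attr.priority = THREAD_SCHED_TS_PRIO_DEFAULT;
 *  error = thread_create(&thread, &attr, my_worker, NULL);
 *
 *  if (error)
 *      panic("unable to create worker thread");
 *
 * The new thread is initialized in the THREAD_SLEEPING state and immediately
 * woken up by thread_create(), so it becomes runnable as soon as a run queue
 * has been selected for it.
 */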