author     Richard Braun <rbraun@sceen.net>   2017-03-04 15:51:04 +0100
committer  Richard Braun <rbraun@sceen.net>   2017-03-04 16:47:59 +0100
commit     ef9426483f2f388c4874c6b12e0800645c3dbce4
tree       0cc9eb58a28d8cf0bb02c8b38cced1a4f37ac73f
parent     6fecd5cef7a2f549b4d81053a3b80365ed7828f5
kern/{thread,turnstile}: implement priority inheritance
The new turnstile module provides priority-propagation-capable sleep queues, tightly coupled with the scheduler, and can be used to implement synchronization facilities with priority inheritance.
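As an illustration of the intended use (not part of this commit), the sketch below shows how a mutex-like primitive with priority inheritance might be built on the new interface. The mutex_pi type and the "mutex_pi" wait channel name are made up for this sketch, and, to keep the sketch free of atomic operations, both the lock and the unlock paths go through turnstile_lend()/turnstile_return() so that every access to the owner field is serialized by the turnstile keyed on the mutex; a production mutex would instead use an atomic lock word with a contended state and only take the slow path when needed.

#include <stddef.h>

#include <kern/thread.h>
#include <kern/turnstile.h>

struct mutex_pi {
    struct thread *owner;               /* NULL when unlocked */
};

static void
mutex_pi_lock(struct mutex_pi *mutex)
{
    struct turnstile *turnstile;
    struct thread *owner;

    /* Lend the caller's private turnstile, keyed on the mutex address */
    turnstile = turnstile_lend(mutex);

    for (;;) {
        owner = mutex->owner;

        if (owner == NULL) {
            mutex->owner = thread_self();
            break;
        }

        /*
         * Sleep until signalled; the caller's priority is propagated
         * along the chain of turnstiles and owners as needed.
         */
        turnstile_wait(turnstile, "mutex_pi", owner);
    }

    /* Register ownership so that any remaining waiters keep boosting us */
    turnstile_own(turnstile);
    turnstile_return(turnstile);
}

static void
mutex_pi_unlock(struct mutex_pi *mutex)
{
    struct turnstile *turnstile;

    /* Lending (rather than acquiring) keeps the owner field serialized */
    turnstile = turnstile_lend(mutex);

    if (!turnstile_empty(turnstile)) {
        /* Drop the inherited boost and wake up the highest priority waiter */
        turnstile_disown(turnstile);
        mutex->owner = NULL;
        turnstile_signal(turnstile);
    } else {
        mutex->owner = NULL;
    }

    turnstile_return(turnstile);

    /* Re-evaluate this thread's own priority now that the boost is gone */
    turnstile_td_propagate_priority(thread_turnstile_td(thread_self()));
}

Ownership tracking is shared between both sides: turnstile_wait() registers the owner passed by the waiter, and the thread that eventually takes the mutex re-registers itself with turnstile_own() before returning its turnstile, so priority boosts keep flowing to whichever thread currently holds the lock.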
Diffstat (limited to 'kern')
-rw-r--r--   kern/kernel.c              2
-rw-r--r--   kern/task.c                4
-rw-r--r--   kern/thread.c            390
-rw-r--r--   kern/thread.h            141
-rw-r--r--   kern/thread_i.h           31
-rw-r--r--   kern/turnstile.c         788
-rw-r--r--   kern/turnstile.h         191
-rw-r--r--   kern/turnstile_types.h    58
8 files changed, 1484 insertions(+), 121 deletions(-)
diff --git a/kern/kernel.c b/kern/kernel.c
index 9b8d4eec..cd3d3e3b 100644
--- a/kern/kernel.c
+++ b/kern/kernel.c
@@ -24,6 +24,7 @@
#include <kern/sref.h>
#include <kern/task.h>
#include <kern/thread.h>
+#include <kern/turnstile.h>
#include <kern/work.h>
#include <kern/xcall.h>
#include <machine/cpu.h>
@@ -43,6 +44,7 @@ kernel_main(void)
xcall_setup();
task_setup();
sleepq_setup();
+ turnstile_setup();
thread_setup();
work_setup();
llsync_setup();
diff --git a/kern/task.c b/kern/task.c
index afc2407a..d7d91c17 100644
--- a/kern/task.c
+++ b/kern/task.c
@@ -150,8 +150,8 @@ task_info(struct task *task)
thread_state_to_chr(thread),
thread_wchan_desc(thread),
(unsigned long)thread_wchan_addr(thread),
- thread_sched_class_to_str(thread_sched_class(thread)),
- thread_priority(thread),
+ thread_sched_class_to_str(thread_user_sched_class(thread)),
+ thread_user_priority(thread),
thread->name);
}
diff --git a/kern/thread.c b/kern/thread.c
index 069881c0..3eccebda 100644
--- a/kern/thread.c
+++ b/kern/thread.c
@@ -105,6 +105,7 @@
#include <kern/sref.h>
#include <kern/task.h>
#include <kern/thread.h>
+#include <kern/turnstile.h>
#include <kern/work.h>
#include <machine/atomic.h>
#include <machine/cpu.h>
@@ -261,13 +262,13 @@ struct thread_runq {
* Operations of a scheduling class.
*/
struct thread_sched_ops {
- void (*init_sched)(struct thread *thread, unsigned short priority);
struct thread_runq * (*select_runq)(struct thread *thread);
void (*add)(struct thread_runq *runq, struct thread *thread);
void (*remove)(struct thread_runq *runq, struct thread *thread);
void (*put_prev)(struct thread_runq *runq, struct thread *thread);
struct thread * (*get_next)(struct thread_runq *runq);
- void (*set_priority)(struct thread *thread, unsigned short priority);
+ void (*reset_priority)(struct thread *thread, unsigned short priority);
+ void (*update_priority)(struct thread *thread, unsigned short priority);
unsigned int (*get_global_priority)(unsigned short priority);
void (*set_next)(struct thread_runq *runq, struct thread *thread);
void (*tick)(struct thread_runq *runq, struct thread *thread);
@@ -343,6 +344,13 @@ struct thread_zombie {
struct thread *thread;
};
+static unsigned char
+thread_policy_to_class(unsigned char policy)
+{
+ assert(policy < ARRAY_SIZE(thread_policy_table));
+ return thread_policy_table[policy];
+}
+
static void
thread_set_wchan(struct thread *thread, const void *wchan_addr,
const char *wchan_desc)
@@ -358,15 +366,24 @@ thread_clear_wchan(struct thread *thread)
}
static const struct thread_sched_ops *
-thread_get_sched_ops(const struct thread *thread)
+thread_get_sched_ops(unsigned char sched_class)
{
- unsigned char sched_class;
-
- sched_class = thread_sched_class(thread);
assert(sched_class < ARRAY_SIZE(thread_sched_ops));
return &thread_sched_ops[sched_class];
}
+static const struct thread_sched_ops *
+thread_get_user_sched_ops(const struct thread *thread)
+{
+ return thread_get_sched_ops(thread_user_sched_class(thread));
+}
+
+static const struct thread_sched_ops *
+thread_get_real_sched_ops(const struct thread *thread)
+{
+ return thread_get_sched_ops(thread_real_sched_class(thread));
+}
+
static void __init
thread_runq_init_rt(struct thread_runq *runq)
{
@@ -458,7 +475,7 @@ thread_runq_add(struct thread_runq *runq, struct thread *thread)
spinlock_assert_locked(&runq->lock);
assert(!thread->in_runq);
- ops = thread_get_sched_ops(thread);
+ ops = thread_get_real_sched_ops(thread);
ops->add(runq, thread);
if (runq->nr_threads == 0) {
@@ -467,7 +484,8 @@ thread_runq_add(struct thread_runq *runq, struct thread *thread)
runq->nr_threads++;
- if (thread_sched_class(thread) < thread_sched_class(runq->current)) {
+ if (thread_real_sched_class(thread)
+ < thread_real_sched_class(runq->current)) {
thread_set_flag(runq->current, THREAD_YIELD);
}
@@ -490,7 +508,7 @@ thread_runq_remove(struct thread_runq *runq, struct thread *thread)
cpumap_set_atomic(&thread_idle_runqs, thread_runq_cpu(runq));
}
- ops = thread_get_sched_ops(thread);
+ ops = thread_get_real_sched_ops(thread);
ops->remove(runq, thread);
thread->in_runq = false;
@@ -504,7 +522,7 @@ thread_runq_put_prev(struct thread_runq *runq, struct thread *thread)
assert(!cpu_intr_enabled());
spinlock_assert_locked(&runq->lock);
- ops = thread_get_sched_ops(thread);
+ ops = thread_get_real_sched_ops(thread);
if (ops->put_prev != NULL) {
ops->put_prev(runq, thread);
@@ -538,7 +556,7 @@ thread_runq_set_next(struct thread_runq *runq, struct thread *thread)
{
const struct thread_sched_ops *ops;
- ops = thread_get_sched_ops(thread);
+ ops = thread_get_real_sched_ops(thread);
if (ops->set_next != NULL) {
ops->set_next(runq, thread);
@@ -660,13 +678,6 @@ thread_runq_double_lock(struct thread_runq *a, struct thread_runq *b)
}
}
-static void
-thread_sched_rt_init_sched(struct thread *thread, unsigned short priority)
-{
- assert(priority <= THREAD_SCHED_RT_PRIO_MAX);
- thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE;
-}
-
static struct thread_runq *
thread_sched_rt_select_runq(struct thread *thread)
{
@@ -693,15 +704,17 @@ thread_sched_rt_add(struct thread_runq *runq, struct thread *thread)
struct list *threads;
rt_runq = &runq->rt_runq;
- threads = &rt_runq->threads[thread_priority(thread)];
+ threads = &rt_runq->threads[thread_real_priority(thread)];
list_insert_tail(threads, &thread->rt_data.node);
if (list_singular(threads)) {
- rt_runq->bitmap |= (1ULL << thread_priority(thread));
+ rt_runq->bitmap |= (1ULL << thread_real_priority(thread));
}
- if ((thread_sched_class(thread) == thread_sched_class(runq->current))
- && (thread_priority(thread) > thread_priority(runq->current))) {
+ if ((thread_real_sched_class(thread)
+ == thread_real_sched_class(runq->current))
+ && (thread_real_priority(thread)
+ > thread_real_priority(runq->current))) {
thread_set_flag(runq->current, THREAD_YIELD);
}
}
@@ -713,11 +726,11 @@ thread_sched_rt_remove(struct thread_runq *runq, struct thread *thread)
struct list *threads;
rt_runq = &runq->rt_runq;
- threads = &rt_runq->threads[thread_priority(thread)];
+ threads = &rt_runq->threads[thread_real_priority(thread)];
list_remove(&thread->rt_data.node);
if (list_empty(threads)) {
- rt_runq->bitmap &= ~(1ULL << thread_priority(thread));
+ rt_runq->bitmap &= ~(1ULL << thread_real_priority(thread));
}
}
@@ -749,6 +762,13 @@ thread_sched_rt_get_next(struct thread_runq *runq)
return thread;
}
+static void
+thread_sched_rt_reset_priority(struct thread *thread, unsigned short priority)
+{
+ assert(priority <= THREAD_SCHED_RT_PRIO_MAX);
+ thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE;
+}
+
static unsigned int
thread_sched_rt_get_global_priority(unsigned short priority)
{
@@ -766,7 +786,7 @@ thread_sched_rt_tick(struct thread_runq *runq, struct thread *thread)
{
(void)runq;
- if (thread_sched_policy(thread) != THREAD_SCHED_POLICY_RR) {
+ if (thread_real_sched_policy(thread) != THREAD_SCHED_POLICY_RR) {
return;
}
@@ -786,16 +806,6 @@ thread_sched_fs_prio2weight(unsigned short priority)
return ((priority + 1) * THREAD_FS_ROUND_SLICE_BASE);
}
-static void
-thread_sched_fs_init_sched(struct thread *thread, unsigned short priority)
-{
- assert(priority <= THREAD_SCHED_FS_PRIO_MAX);
- thread->fs_data.fs_runq = NULL;
- thread->fs_data.round = 0;
- thread->fs_data.weight = thread_sched_fs_prio2weight(priority);
- thread->fs_data.work = 0;
-}
-
static struct thread_runq *
thread_sched_fs_select_runq(struct thread *thread)
{
@@ -897,7 +907,7 @@ thread_sched_fs_enqueue(struct thread_fs_runq *fs_runq, unsigned long round,
assert(thread->fs_data.fs_runq == NULL);
assert(thread->fs_data.work <= thread->fs_data.weight);
- group = &fs_runq->group_array[thread_priority(thread)];
+ group = &fs_runq->group_array[thread_real_priority(thread)];
group_weight = group->weight + thread->fs_data.weight;
total_weight = fs_runq->weight + thread->fs_data.weight;
node = (group->weight == 0)
@@ -973,7 +983,7 @@ thread_sched_fs_restart(struct thread_runq *runq)
assert(node != NULL);
fs_runq->current = list_entry(node, struct thread_fs_group, node);
- if (thread_sched_class(runq->current) == THREAD_SCHED_CLASS_FS) {
+ if (thread_real_sched_class(runq->current) == THREAD_SCHED_CLASS_FS) {
thread_set_flag(runq->current, THREAD_YIELD);
}
}
@@ -1009,7 +1019,7 @@ thread_sched_fs_dequeue(struct thread *thread)
assert(thread->fs_data.fs_runq != NULL);
fs_runq = thread->fs_data.fs_runq;
- group = &fs_runq->group_array[thread_priority(thread)];
+ group = &fs_runq->group_array[thread_real_priority(thread)];
thread->fs_data.fs_runq = NULL;
list_remove(&thread->fs_data.runq_node);
@@ -1084,7 +1094,7 @@ thread_sched_fs_put_prev(struct thread_runq *runq, struct thread *thread)
struct thread_fs_group *group;
fs_runq = runq->fs_runq_active;
- group = &fs_runq->group_array[thread_priority(thread)];
+ group = &fs_runq->group_array[thread_real_priority(thread)];
list_insert_tail(&group->threads, &thread->fs_data.group_node);
if (thread->fs_data.work >= thread->fs_data.weight) {
@@ -1152,8 +1162,19 @@ thread_sched_fs_get_next(struct thread_runq *runq)
}
static void
-thread_sched_fs_set_priority(struct thread *thread, unsigned short priority)
+thread_sched_fs_reset_priority(struct thread *thread, unsigned short priority)
+{
+ assert(priority <= THREAD_SCHED_FS_PRIO_MAX);
+ thread->fs_data.fs_runq = NULL;
+ thread->fs_data.round = 0;
+ thread->fs_data.weight = thread_sched_fs_prio2weight(priority);
+ thread->fs_data.work = 0;
+}
+
+static void
+thread_sched_fs_update_priority(struct thread *thread, unsigned short priority)
{
+ assert(priority <= THREAD_SCHED_FS_PRIO_MAX);
thread->fs_data.weight = thread_sched_fs_prio2weight(priority);
if (thread->fs_data.work >= thread->fs_data.weight) {
@@ -1184,7 +1205,7 @@ thread_sched_fs_tick(struct thread_runq *runq, struct thread *thread)
fs_runq = runq->fs_runq_active;
fs_runq->work++;
- group = &fs_runq->group_array[thread_priority(thread)];
+ group = &fs_runq->group_array[thread_real_priority(thread)];
group->work++;
thread_set_flag(thread, THREAD_YIELD);
thread->fs_data.work++;
@@ -1235,7 +1256,8 @@ thread_sched_fs_balance_eligible(struct thread_runq *runq,
if ((nr_threads == 0)
|| ((nr_threads == 1)
- && (thread_sched_class(runq->current) == THREAD_SCHED_CLASS_FS))) {
+ && (thread_real_sched_class(runq->current)
+ == THREAD_SCHED_CLASS_FS))) {
return 0;
}
@@ -1525,39 +1547,40 @@ thread_sched_idle_get_global_priority(unsigned short priority)
return THREAD_SCHED_GLOBAL_PRIO_IDLE;
}
-static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES] = {
+static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES]
+ = {
[THREAD_SCHED_CLASS_RT] = {
- .init_sched = thread_sched_rt_init_sched,
.select_runq = thread_sched_rt_select_runq,
.add = thread_sched_rt_add,
.remove = thread_sched_rt_remove,
.put_prev = thread_sched_rt_put_prev,
.get_next = thread_sched_rt_get_next,
- .set_priority = NULL,
+ .reset_priority = thread_sched_rt_reset_priority,
+ .update_priority = NULL,
.get_global_priority = thread_sched_rt_get_global_priority,
.set_next = thread_sched_rt_set_next,
.tick = thread_sched_rt_tick,
},
[THREAD_SCHED_CLASS_FS] = {
- .init_sched = thread_sched_fs_init_sched,
.select_runq = thread_sched_fs_select_runq,
.add = thread_sched_fs_add,
.remove = thread_sched_fs_remove,
.put_prev = thread_sched_fs_put_prev,
.get_next = thread_sched_fs_get_next,
- .set_priority = thread_sched_fs_set_priority,
+ .reset_priority = thread_sched_fs_reset_priority,
+ .update_priority = thread_sched_fs_update_priority,
.get_global_priority = thread_sched_fs_get_global_priority,
.set_next = thread_sched_fs_set_next,
.tick = thread_sched_fs_tick,
},
[THREAD_SCHED_CLASS_IDLE] = {
- .init_sched = NULL,
.select_runq = thread_sched_idle_select_runq,
.add = thread_sched_idle_add,
.remove = thread_sched_idle_remove,
.put_prev = NULL,
.get_next = thread_sched_idle_get_next,
- .set_priority = NULL,
+ .reset_priority = NULL,
+ .update_priority = NULL,
.get_global_priority = thread_sched_idle_get_global_priority,
.set_next = NULL,
.tick = NULL,
@@ -1565,30 +1588,95 @@ static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES] =
};
static void
-thread_set_sched_policy(struct thread *thread, unsigned char sched_policy)
+thread_set_user_sched_policy(struct thread *thread, unsigned char sched_policy)
+{
+ thread->user_sched_data.sched_policy = sched_policy;
+}
+
+static void
+thread_set_user_sched_class(struct thread *thread, unsigned char sched_class)
+{
+ thread->user_sched_data.sched_class = sched_class;
+}
+
+static void
+thread_set_user_priority(struct thread *thread, unsigned short priority)
+{
+ const struct thread_sched_ops *ops;
+
+ ops = thread_get_user_sched_ops(thread);
+
+ thread->user_sched_data.priority = priority;
+ thread->user_sched_data.global_priority
+ = ops->get_global_priority(priority);
+}
+
+static void
+thread_update_user_priority(struct thread *thread, unsigned short priority)
+{
+ thread_set_user_priority(thread, priority);
+}
+
+static void
+thread_set_real_sched_policy(struct thread *thread, unsigned char sched_policy)
{
- thread->sched_data.sched_policy = sched_policy;
+ thread->real_sched_data.sched_policy = sched_policy;
}
static void
-thread_set_sched_class(struct thread *thread, unsigned char sched_class)
+thread_set_real_sched_class(struct thread *thread, unsigned char sched_class)
{
- thread->sched_data.sched_class = sched_class;
+ thread->real_sched_data.sched_class = sched_class;
}
static void
-thread_set_priority(struct thread *thread, unsigned short priority)
+thread_set_real_priority(struct thread *thread, unsigned short priority)
{
const struct thread_sched_ops *ops;
- ops = thread_get_sched_ops(thread);
+ ops = thread_get_real_sched_ops(thread);
- if (ops->set_priority != NULL) {
- ops->set_priority(thread, priority);
+ thread->real_sched_data.priority = priority;
+ thread->real_sched_data.global_priority
+ = ops->get_global_priority(priority);
+
+ if (ops->reset_priority != NULL) {
+ ops->reset_priority(thread, priority);
+ }
+}
+
+static void
+thread_update_real_priority(struct thread *thread, unsigned short priority)
+{
+ const struct thread_sched_ops *ops;
+
+ ops = thread_get_real_sched_ops(thread);
+
+ thread->real_sched_data.priority = priority;
+ thread->real_sched_data.global_priority
+ = ops->get_global_priority(priority);
+
+ if (ops->update_priority != NULL) {
+ ops->update_priority(thread, priority);
}
+}
+
+static void
+thread_reset_real_priority(struct thread *thread)
+{
+ const struct thread_sched_ops *ops;
+ struct thread_sched_data *user, *real;
- thread->sched_data.priority = priority;
- thread->sched_data.global_priority = ops->get_global_priority(priority);
+ user = &thread->user_sched_data;
+ real = &thread->real_sched_data;
+ *real = *user;
+ thread->boosted = false;
+
+ ops = thread_get_user_sched_ops(thread);
+
+ if (ops->reset_priority != NULL) {
+ ops->reset_priority(thread, real->priority);
+ }
}
static void __init
@@ -1604,9 +1692,10 @@ thread_bootstrap_common(unsigned int cpu)
booter->flags = 0;
booter->preempt = 1;
cpumap_fill(&booter->cpumap);
- thread_set_sched_policy(booter, THREAD_SCHED_POLICY_IDLE);
- thread_set_sched_class(booter, THREAD_SCHED_CLASS_IDLE);
- thread_set_priority(booter, 0);
+ thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE);
+ thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE);
+ thread_set_user_priority(booter, 0);
+ thread_reset_real_priority(booter);
memset(booter->tsd, 0, sizeof(booter->tsd));
booter->task = kernel_task;
thread_runq_init(percpu_ptr(thread_runq, cpu), cpu, booter);
@@ -1620,8 +1709,8 @@ thread_bootstrap(void)
thread_fs_highest_round = THREAD_FS_INITIAL_ROUND;
- thread_bootstrap_common(0);
tcb_set_current(&thread_booters[0].tcb);
+ thread_bootstrap_common(0);
}
void __init
@@ -1677,23 +1766,9 @@ thread_destroy_tsd(struct thread *thread)
}
}
-static void
-thread_init_sched(struct thread *thread, unsigned short priority)
-{
- const struct thread_sched_ops *ops;
-
- ops = thread_get_sched_ops(thread);
-
- if (ops->init_sched != NULL) {
- ops->init_sched(thread, priority);
- }
-
- thread->sched_data.priority = priority;
- thread->sched_data.global_priority = ops->get_global_priority(priority);
-}
-
static int
-thread_init(struct thread *thread, void *stack, const struct thread_attr *attr,
+thread_init(struct thread *thread, void *stack,
+ const struct thread_attr *attr,
void (*fn)(void *), void *arg)
{
struct thread *caller;
@@ -1720,13 +1795,23 @@ thread_init(struct thread *thread, void *stack, const struct thread_attr *attr,
goto error_sleepq;
}
+ thread->priv_turnstile = turnstile_create();
+
+ if (thread->priv_turnstile == NULL) {
+ error = ERROR_NOMEM;
+ goto error_turnstile;
+ }
+
+ turnstile_td_init(&thread->turnstile_td);
+ thread->propagate_priority = false;
thread->preempt = THREAD_SUSPEND_PREEMPT_LEVEL;
thread->pinned = 0;
thread->llsync_read = 0;
cpumap_copy(&thread->cpumap, cpumap);
- thread_set_sched_policy(thread, attr->policy);
- thread_set_sched_class(thread, thread_policy_table[attr->policy]);
- thread_init_sched(thread, attr->priority);
+ thread_set_user_sched_policy(thread, attr->policy);
+ thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy));
+ thread_set_user_priority(thread, attr->priority);
+ thread_reset_real_priority(thread);
memset(thread->tsd, 0, sizeof(thread->tsd));
mutex_init(&thread->join_lock);
condition_init(&thread->join_cond);
@@ -1753,6 +1838,8 @@ thread_init(struct thread *thread, void *stack, const struct thread_attr *attr,
error_tcb:
thread_destroy_tsd(thread);
+ turnstile_destroy(thread->priv_turnstile);
+error_turnstile:
sleepq_destroy(thread->priv_sleepq);
error_sleepq:
return error;
@@ -1799,6 +1886,7 @@ thread_destroy(struct thread *thread)
thread_destroy_tsd(thread);
task_remove_thread(thread->task, thread);
+ turnstile_destroy(thread->priv_turnstile);
sleepq_destroy(thread->priv_sleepq);
kmem_cache_free(&thread_stack_cache, thread->stack);
kmem_cache_free(&thread_cache, thread);
@@ -2224,7 +2312,7 @@ thread_wakeup(struct thread *thread)
thread->state = THREAD_RUNNING;
} else {
/*
- * If another wakeup was attempted right before this one, the thread
+ * If another wake-up was attempted right before this one, the thread
* may currently be pushed on a remote run queue, and the run queue
* being locked here is actually the previous one. The run queue
* pointer may be modified concurrently, now being protected by the
@@ -2249,7 +2337,7 @@ thread_wakeup(struct thread *thread)
cpu_intr_save(&flags);
if (!thread->pinned) {
- runq = thread_sched_ops[thread_sched_class(thread)].select_runq(thread);
+ runq = thread_get_real_sched_ops(thread)->select_runq(thread);
} else {
runq = thread->runq;
spinlock_lock(&runq->lock);
@@ -2341,7 +2429,7 @@ thread_tick_intr(void)
thread_balance_idle_tick(runq);
}
- ops = thread_get_sched_ops(thread);
+ ops = thread_get_real_sched_ops(thread);
if (ops->tick != NULL) {
ops->tick(runq, thread);
@@ -2350,6 +2438,7 @@ thread_tick_intr(void)
spinlock_unlock(&runq->lock);
}
+/* TODO Move outside */
char
thread_state_to_chr(const struct thread *thread)
{
@@ -2365,10 +2454,11 @@ thread_state_to_chr(const struct thread *thread)
}
}
+/* TODO Move outside */
const char *
-thread_sched_class_to_str(unsigned char sched_policy)
+thread_sched_class_to_str(unsigned char sched_class)
{
- switch (sched_policy) {
+ switch (sched_class) {
case THREAD_SCHED_CLASS_RT:
return "rt";
case THREAD_SCHED_CLASS_FS:
@@ -2385,13 +2475,96 @@ thread_setscheduler(struct thread *thread, unsigned char policy,
unsigned short priority)
{
struct thread_runq *runq;
+ struct turnstile_td *td;
+ unsigned long flags;
+ bool requeue, current, update;
+
+ td = thread_turnstile_td(thread);
+
+ turnstile_td_lock(td);
+ runq = thread_lock_runq(thread, &flags);
+
+ if ((thread_user_sched_policy(thread) == policy)
+ && (thread_user_priority(thread) == priority)) {
+ goto out;
+ }
+
+ requeue = thread->in_runq;
+
+ if (!requeue) {
+ current = false;
+ } else {
+ if (thread != runq->current) {
+ current = false;
+ } else {
+ thread_runq_put_prev(runq, thread);
+ current = true;
+ }
+
+ thread_runq_remove(runq, thread);
+ }
+
+ if (thread_user_sched_policy(thread) == policy) {
+ thread_update_user_priority(thread, priority);
+ update = true;
+ } else {
+ thread_set_user_sched_policy(thread, policy);
+ thread_set_user_sched_class(thread, thread_policy_to_class(policy));
+ thread_set_user_priority(thread, priority);
+ update = false;
+ }
+
+ if (thread->boosted) {
+ if (thread_user_global_priority(thread)
+ >= thread_real_global_priority(thread)) {
+ thread_reset_real_priority(thread);
+ }
+ } else {
+ if (update) {
+ thread_update_real_priority(thread, priority);
+ } else {
+ thread_set_real_sched_policy(thread, policy);
+ thread_set_real_sched_class(thread, thread_policy_to_class(policy));
+ thread_set_real_priority(thread, priority);
+ }
+ }
+
+ if (requeue) {
+ thread_runq_add(runq, thread);
+
+ if (current) {
+ thread_runq_set_next(runq, thread);
+ }
+ }
+
+out:
+ thread_unlock_runq(runq, flags);
+ turnstile_td_unlock(td);
+
+ turnstile_td_propagate_priority(td);
+}
+
+void
+thread_pi_setscheduler(struct thread *thread, unsigned char policy,
+ unsigned short priority)
+{
+ const struct thread_sched_ops *ops;
+ struct thread_runq *runq;
+ struct turnstile_td *td;
+ unsigned int global_priority;
unsigned long flags;
bool requeue, current;
+ td = thread_turnstile_td(thread);
+ turnstile_td_assert_lock(td);
+
+ ops = thread_get_sched_ops(thread_policy_to_class(policy));
+ global_priority = ops->get_global_priority(priority);
+
runq = thread_lock_runq(thread, &flags);
- if ((thread_sched_policy(thread) == policy)
- && (thread_priority(thread) == priority)) {
+ if ((thread_real_sched_policy(thread) == policy)
+ && (thread_real_priority(thread) == priority)) {
goto out;
}
@@ -2410,13 +2583,18 @@ thread_setscheduler(struct thread *thread, unsigned char policy,
thread_runq_remove(runq, thread);
}
- if (thread_sched_policy(thread) == policy) {
- thread_set_priority(thread, priority);
+ if (global_priority <= thread_user_global_priority(thread)) {
+ thread_reset_real_priority(thread);
} else {
- thread_set_sched_policy(thread, policy);
- assert(policy < ARRAY_SIZE(thread_policy_table));
- thread_set_sched_class(thread, thread_policy_table[policy]);
- thread_init_sched(thread, priority);
+ if (thread_real_sched_policy(thread) == policy) {
+ thread_update_real_priority(thread, priority);
+ } else {
+ thread_set_real_sched_policy(thread, policy);
+ thread_set_real_sched_class(thread, thread_policy_to_class(policy));
+ thread_set_real_priority(thread, priority);
+ }
+
+ thread->boosted = true;
}
if (requeue) {
@@ -2432,6 +2610,28 @@ out:
}
void
+thread_propagate_priority(void)
+{
+ struct thread *thread;
+
+ /*
+ * Although it's possible to propagate priority with preemption
+ * disabled, the operation can be too expensive to allow it.
+ */
+ if (!thread_preempt_enabled()) {
+ thread_set_priority_propagation_needed();
+ return;
+ }
+
+ thread = thread_self();
+
+ /* Clear before propagation to avoid infinite recursion */
+ thread->propagate_priority = false;
+
+ turnstile_td_propagate_priority(thread_turnstile_td(thread));
+}
+
+void
thread_key_create(unsigned int *keyp, thread_dtor_fn_t dtor)
{
unsigned int key;
diff --git a/kern/thread.h b/kern/thread.h
index 1542e7ae..15babbfc 100644
--- a/kern/thread.h
+++ b/kern/thread.h
@@ -33,30 +33,23 @@
#ifndef _KERN_THREAD_H
#define _KERN_THREAD_H
+#include <stdbool.h>
+#include <stddef.h>
+
#include <kern/assert.h>
#include <kern/cpumap.h>
#include <kern/macros.h>
+#include <kern/spinlock_types.h>
+#include <kern/turnstile_types.h>
#include <machine/atomic.h>
#include <machine/tcb.h>
/*
- * Forward declaration
- */
-struct spinlock;
-
-/*
* Thread structure.
*/
struct thread;
/*
- * Thread name buffer size.
- */
-#define THREAD_NAME_SIZE 32
-
-/*
- * Common scheduling data.
- *
* The global priority of a thread is meant to be compared against
* another global priority to determine which thread has higher priority.
*/
@@ -67,6 +60,11 @@ struct thread_sched_data {
unsigned int global_priority;
};
+/*
+ * Thread name buffer size.
+ */
+#define THREAD_NAME_SIZE 32
+
#include <kern/thread_i.h>
#define THREAD_KERNEL_PREFIX PACKAGE "_"
@@ -261,6 +259,15 @@ void thread_tick_intr(void);
void thread_setscheduler(struct thread *thread, unsigned char policy,
unsigned short priority);
+/*
+ * Variant used for priority inheritance.
+ *
+ * The caller must hold the turnstile thread data lock and no turnstile
+ * locks when calling this function.
+ */
+void thread_pi_setscheduler(struct thread *thread, unsigned char policy,
+ unsigned short priority);
+
static inline void
thread_ref(struct thread *thread)
{
@@ -301,33 +308,72 @@ thread_wchan_desc(const struct thread *thread)
char thread_state_to_chr(const struct thread *thread);
static inline const struct thread_sched_data *
-thread_get_sched_data(const struct thread *thread)
+thread_get_user_sched_data(const struct thread *thread)
+{
+ return &thread->user_sched_data;
+}
+
+static inline const struct thread_sched_data *
+thread_get_real_sched_data(const struct thread *thread)
+{
+ return &thread->real_sched_data;
+}
+
+/*
+ * If the caller requires the scheduling data to be stable, it
+ * must lock one of the following objects :
+ * - the containing run queue
+ * - the per-thread turnstile data (turnstile_td)
+ *
+ * Both are locked when scheduling data are updated.
+ */
+
+static inline unsigned char
+thread_user_sched_policy(const struct thread *thread)
+{
+ return thread_get_user_sched_data(thread)->sched_policy;
+}
+
+static inline unsigned char
+thread_user_sched_class(const struct thread *thread)
{
- return &thread->sched_data;
+ return thread_get_user_sched_data(thread)->sched_class;
+}
+
+static inline unsigned short
+thread_user_priority(const struct thread *thread)
+{
+ return thread_get_user_sched_data(thread)->priority;
+}
+
+static inline unsigned int
+thread_user_global_priority(const struct thread *thread)
+{
+ return thread_get_user_sched_data(thread)->global_priority;
}
static inline unsigned char
-thread_sched_policy(const struct thread *thread)
+thread_real_sched_policy(const struct thread *thread)
{
- return thread_get_sched_data(thread)->sched_policy;
+ return thread_get_real_sched_data(thread)->sched_policy;
}
static inline unsigned char
-thread_sched_class(const struct thread *thread)
+thread_real_sched_class(const struct thread *thread)
{
- return thread_get_sched_data(thread)->sched_class;
+ return thread_get_real_sched_data(thread)->sched_class;
}
static inline unsigned short
-thread_priority(const struct thread *thread)
+thread_real_priority(const struct thread *thread)
{
- return thread_get_sched_data(thread)->priority;
+ return thread_get_real_sched_data(thread)->priority;
}
static inline unsigned int
-thread_global_priority(const struct thread *thread)
+thread_real_global_priority(const struct thread *thread)
{
- return thread_get_sched_data(thread)->global_priority;
+ return thread_get_real_sched_data(thread)->global_priority;
}
/*
@@ -390,6 +436,53 @@ thread_sleepq_return(struct sleepq *sleepq)
}
/*
+ * Turnstile lending functions.
+ */
+
+static inline struct turnstile *
+thread_turnstile_lend(void)
+{
+ struct turnstile *turnstile;
+
+ turnstile = thread_self()->priv_turnstile;
+ assert(turnstile != NULL);
+ thread_self()->priv_turnstile = NULL;
+ return turnstile;
+}
+
+static inline void
+thread_turnstile_return(struct turnstile *turnstile)
+{
+ assert(turnstile != NULL);
+ assert(thread_self()->priv_turnstile == NULL);
+ thread_self()->priv_turnstile = turnstile;
+}
+
+static inline struct turnstile_td *
+thread_turnstile_td(struct thread *thread)
+{
+ return &thread->turnstile_td;
+}
+
+/*
+ * Priority propagation functions.
+ */
+
+static inline bool
+thread_priority_propagation_needed(void)
+{
+ return thread_self()->propagate_priority;
+}
+
+static inline void
+thread_set_priority_propagation_needed(void)
+{
+ thread_self()->propagate_priority = true;
+}
+
+void thread_propagate_priority(void);
+
+/*
* Migration control functions.
*
* Functions that change the migration state are implicit compiler barriers.
@@ -444,6 +537,10 @@ thread_preempt_enable_no_resched(void)
thread = thread_self();
assert(thread->preempt != 0);
thread->preempt--;
+
+ if (thread_preempt_enabled() && thread_priority_propagation_needed()) {
+ thread_propagate_priority();
+ }
}
static inline void
diff --git a/kern/thread_i.h b/kern/thread_i.h
index 55f96082..865787a0 100644
--- a/kern/thread_i.h
+++ b/kern/thread_i.h
@@ -26,6 +26,7 @@
#include <kern/macros.h>
#include <kern/mutex_types.h>
#include <kern/param.h>
+#include <kern/turnstile_types.h>
#include <machine/atomic.h>
#include <machine/tcb.h>
@@ -103,6 +104,19 @@ struct thread {
/* Sleep queue available for lending */
struct sleepq *priv_sleepq;
+ /* Turnstile available for lending */
+ struct turnstile *priv_turnstile;
+
+ /* Per-thread turnstile data */
+ struct turnstile_td turnstile_td;
+
+ /*
+ * True if priority must be propagated when preemption is reenabled
+ *
+ * This member is thread-local.
+ */
+ bool propagate_priority;
+
/* Thread-local members */
unsigned short preempt;
unsigned short pinned;
@@ -111,8 +125,21 @@ struct thread {
/* Processors on which this thread is allowed to run */
struct cpumap cpumap;
- /* Scheduling data */
- struct thread_sched_data sched_data;
+ /*
+ * Scheduling data.
+ */
+ struct thread_sched_data user_sched_data; /* User-provided */
+ struct thread_sched_data real_sched_data; /* Computed from
+ priority propagation */
+
+ /*
+ * True if the real scheduling data are not the user scheduling data.
+ *
+ * Note that it doesn't provide any information about priority inheritance.
+ * A thread may be part of a priority inheritance chain without its
+ * priority being boosted.
+ */
+ bool boosted;
/* Class specific scheduling data */
union {
diff --git a/kern/turnstile.c b/kern/turnstile.c
new file mode 100644
index 00000000..43629376
--- /dev/null
+++ b/kern/turnstile.c
@@ -0,0 +1,788 @@
+/*
+ * Copyright (c) 2017 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * This implementation is based on "Solaris(tm) Internals: Solaris 10 and
+ * OpenSolaris Kernel Architecture, Second Edition" by Richard McDougall
+ * and Jim Mauro. See "Part Six: Platform Specifics, Chapter 17: Locking
+ * and Synchronization, Section 7 Turnstiles and Priority Inheritance".
+ *
+ * The differences are outlined below.
+ *
+ * This implementation doesn't support read/write locks, only mutual
+ * exclusion, because ownership doesn't apply well to read/write locks.
+ *
+ * Instead of using an external sleep queue object, this module implements
+ * that functionality itself. The reasons behind this decision are :
+ * - the use of expensive priority lists used to queue threads, that
+ * a simpler sleep queue implementation shouldn't use
+ * - increased coupling with the scheduler
+ *
+ * Locking order : bucket (turnstile) -> turnstile_td -> thread_runq
+ *
+ * This order allows changing the owner of a turnstile without unlocking it
+ * which is important because a turnstile is normally used to synchronize
+ * access to the owner. Unlocking a turnstile would allow the owner to
+ * change and make complex transient states visible. The drawback is that
+ * a thread cannot be requeued atomically when its priority is changed.
+ * That deferred requeue is done during priority propagation.
+ *
+ * TODO Analyse hash parameters.
+ */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <kern/assert.h>
+#include <kern/init.h>
+#include <kern/kmem.h>
+#include <kern/list.h>
+#include <kern/macros.h>
+#include <kern/param.h>
+#include <kern/plist.h>
+#include <kern/spinlock.h>
+#include <kern/thread.h>
+#include <kern/turnstile.h>
+#include <kern/turnstile_types.h>
+
+/*
+ * Locking keys :
+ * (b) bucket
+ */
+struct turnstile_bucket {
+ struct spinlock lock;
+ struct list list; /* (b) */
+} __aligned(CPU_L1_SIZE);
+
+/*
+ * Adding/removing waiters to/from a turnstile are performed while
+ * holding both the waiter's thread data and the turnstile locks.
+ *
+ * Changing the owner of a turnstile and linking/unlinking the turnstile
+ * into/from the owner's list of owned turnstiles are done atomically,
+ * while holding both the owner's thread data and the turnstile locks.
+ *
+ * Locking keys :
+ * (b) bucket
+ * (t) turnstile_td
+ *
+ * (*) A turnstile is referenced in thread data after/before being
+ * added/removed to/from its bucket. Referencing a turnstile in
+ * thread data requires holding the thread data lock. This implies
+ * that, while holding the thread data lock, if the referenced
+ * turnstile isn't NULL, the bucket pointer is also not NULL and
+ * stable.
+ */
+struct turnstile {
+ struct turnstile_bucket *bucket; /* (b*) */
+ struct list node; /* (b) */
+ const void *sync_obj; /* (b) */
+ struct plist waiters; /* (b,t) */
+ struct turnstile *next_free; /* (b) */
+ struct turnstile_waiter *top_waiter; /* (b,t) */
+ struct thread *owner; /* (b,t) */
+ struct plist_node td_node; /* (t) */
+};
+
+/*
+ * Locking keys :
+ * (b) bucket
+ * (t) turnstile_td
+ */
+struct turnstile_waiter {
+ struct plist_node node; /* (b,t) */
+ struct thread *thread; /* (b,t) */
+ bool awaken; /* (b) */
+};
+
+#define TURNSTILE_HTABLE_SIZE 128
+
+#if !ISP2(TURNSTILE_HTABLE_SIZE)
+#error hash table size must be a power of two
+#endif /* !ISP2(TURNSTILE_HTABLE_SIZE) */
+
+#define TURNSTILE_HTABLE_MASK (TURNSTILE_HTABLE_SIZE - 1)
+
+static struct turnstile_bucket turnstile_htable[TURNSTILE_HTABLE_SIZE];
+
+static struct kmem_cache turnstile_cache;
+
+static uintptr_t
+turnstile_hash(const void *addr)
+{
+ return ((uintptr_t)addr >> 8) ^ (uintptr_t)addr;
+}
+
+static void
+turnstile_waiter_init(struct turnstile_waiter *waiter, struct thread *thread)
+{
+ plist_node_init(&waiter->node, thread_real_global_priority(thread));
+ waiter->thread = thread;
+ waiter->awaken = false;
+}
+
+static unsigned int
+turnstile_waiter_priority(const struct turnstile_waiter *waiter)
+{
+ return plist_node_priority(&waiter->node);
+}
+
+static bool
+turnstile_waiter_awaken(const struct turnstile_waiter *waiter)
+{
+ return waiter->awaken;
+}
+
+static void
+turnstile_waiter_set_awaken(struct turnstile_waiter *waiter)
+{
+ waiter->awaken = true;
+}
+
+static void
+turnstile_waiter_clear_awaken(struct turnstile_waiter *waiter)
+{
+ waiter->awaken = false;
+}
+
+static void
+turnstile_waiter_wakeup(struct turnstile_waiter *waiter)
+{
+ if (turnstile_waiter_awaken(waiter)) {
+ return;
+ }
+
+ thread_wakeup(waiter->thread);
+ turnstile_waiter_set_awaken(waiter);
+}
+
+static void
+turnstile_update_top_waiter(struct turnstile *turnstile)
+{
+ if (turnstile_empty(turnstile)) {
+ turnstile->top_waiter = NULL;
+ return;
+ }
+
+ turnstile->top_waiter = plist_last_entry(&turnstile->waiters,
+ struct turnstile_waiter, node);
+}
+
+static void
+turnstile_add_waiter(struct turnstile *turnstile,
+ struct turnstile_waiter *waiter)
+{
+ assert(!turnstile_waiter_awaken(waiter));
+ plist_add(&turnstile->waiters, &waiter->node);
+ turnstile_update_top_waiter(turnstile);
+}
+
+static void
+turnstile_remove_waiter(struct turnstile *turnstile,
+ struct turnstile_waiter *waiter)
+{
+ assert(turnstile_waiter_awaken(waiter));
+ plist_remove(&turnstile->waiters, &waiter->node);
+ turnstile_update_top_waiter(turnstile);
+}
+
+static void
+turnstile_waiter_requeue(struct turnstile *turnstile,
+ struct turnstile_waiter *waiter)
+{
+ unsigned int global_priority;
+
+ global_priority = thread_real_global_priority(waiter->thread);
+ assert(global_priority != plist_node_priority(&waiter->node));
+
+ plist_remove(&turnstile->waiters, &waiter->node);
+ plist_node_set_priority(&waiter->node, global_priority);
+ plist_add(&turnstile->waiters, &waiter->node);
+ turnstile_update_top_waiter(turnstile);
+}
+
+static void
+turnstile_td_set_waiter(struct turnstile_td *td,
+ struct turnstile_waiter *waiter)
+{
+ td->waiter = waiter;
+}
+
+static struct turnstile_waiter *
+turnstile_td_get_waiter(const struct turnstile_td *td)
+{
+ return td->waiter;
+}
+
+static void
+turnstile_td_update_top_priority(struct turnstile_td *td)
+{
+ struct turnstile_waiter *top_waiter;
+ struct turnstile *top_turnstile;
+
+ if (plist_empty(&td->owned_turnstiles)) {
+ td->top_global_priority = 0;
+ return;
+ }
+
+ top_turnstile = plist_last_entry(&td->owned_turnstiles,
+ struct turnstile, td_node);
+ top_waiter = top_turnstile->top_waiter;
+
+ if (top_waiter == NULL) {
+ td->top_global_priority = 0;
+ } else {
+ td->top_global_priority = turnstile_waiter_priority(top_waiter);
+ td->top_sched_policy = thread_real_sched_policy(top_waiter->thread);
+ td->top_priority = thread_real_priority(top_waiter->thread);
+ }
+}
+
+static void
+turnstile_td_own(struct turnstile_td *td, struct turnstile *turnstile)
+{
+ struct turnstile_waiter *top_waiter;
+ unsigned int top_priority;
+
+ assert(turnstile->owner == NULL);
+
+ top_waiter = turnstile->top_waiter;
+ assert(top_waiter != NULL);
+ top_priority = thread_real_global_priority(top_waiter->thread);
+ plist_node_init(&turnstile->td_node, top_priority);
+ plist_add(&td->owned_turnstiles, &turnstile->td_node);
+ turnstile_td_update_top_priority(td);
+
+ turnstile->owner = structof(td, struct thread, turnstile_td);
+}
+
+static void
+turnstile_td_disown(struct turnstile_td *td, struct turnstile *turnstile)
+{
+ assert(turnstile->owner == structof(td, struct thread, turnstile_td));
+
+ assert(!plist_node_unlinked(&turnstile->td_node));
+ plist_remove(&td->owned_turnstiles, &turnstile->td_node);
+ turnstile_td_update_top_priority(td);
+
+ turnstile->owner = NULL;
+}
+
+static void
+turnstile_td_reown(struct turnstile_td *td, struct turnstile *turnstile)
+{
+ assert(turnstile->owner == structof(td, struct thread, turnstile_td));
+ assert(!plist_node_unlinked(&turnstile->td_node));
+ plist_remove(&td->owned_turnstiles, &turnstile->td_node);
+ turnstile->owner = NULL;
+ turnstile_td_own(td, turnstile);
+}
+
+/*
+ * The caller must hold the turnstile thread data lock and no turnstile
+ * locks when calling this function. The thread data are unlocked on return.
+ *
+ * In addition, this function drops a reference on the thread associated
+ * with the given thread data.
+ */
+static void
+turnstile_td_propagate_priority_loop(struct turnstile_td *td)
+{
+ unsigned int user_priority, real_priority, top_priority;
+ struct turnstile_waiter *waiter;
+ struct turnstile *turnstile;
+ struct thread *thread;
+ unsigned short priority;
+ unsigned char policy;
+ int error;
+
+ /*
+ * At the very least, this function must make sure that :
+ * - the given thread has its intended priority, which is the
+ * highest among its own and all the waiters in the turnstiles
+ * it owns, and
+ * - the thread is at its intended position in the turnstile it's
+ * waiting on, if any.
+ */
+
+ for (;;) {
+ thread = structof(td, struct thread, turnstile_td);
+ user_priority = thread_user_global_priority(thread);
+ real_priority = thread_real_global_priority(thread);
+ top_priority = td->top_global_priority;
+
+ if (top_priority > user_priority) {
+ policy = td->top_sched_policy;
+ priority = td->top_priority;
+ } else {
+ top_priority = user_priority;
+ policy = thread_user_sched_policy(thread);
+ priority = thread_user_priority(thread);
+ }
+
+ if (top_priority != real_priority) {
+ thread_pi_setscheduler(thread, policy, priority);
+ }
+
+ waiter = turnstile_td_get_waiter(td);
+
+ if ((waiter == NULL)
+ || (top_priority == turnstile_waiter_priority(waiter))) {
+ spinlock_unlock(&td->lock);
+ thread_unref(thread);
+ return;
+ }
+
+ turnstile = turnstile_td_get_turnstile(td);
+ assert(turnstile != NULL);
+
+ error = spinlock_trylock(&turnstile->bucket->lock);
+
+ if (error) {
+ spinlock_unlock(&td->lock);
+ spinlock_lock(&td->lock);
+ continue;
+ }
+
+ /*
+ * This couldn't be done while changing the thread's priority
+ * because of locking restrictions. Do it now.
+ */
+ turnstile_waiter_requeue(turnstile, waiter);
+
+ spinlock_unlock(&td->lock);
+ thread_unref(thread);
+
+ thread = turnstile->owner;
+
+ if (thread == NULL) {
+ break;
+ }
+
+ td = thread_turnstile_td(thread);
+
+ thread_ref(thread);
+ spinlock_lock(&td->lock);
+
+ turnstile_td_reown(td, turnstile);
+
+ spinlock_unlock(&turnstile->bucket->lock);
+ }
+
+ spinlock_unlock(&turnstile->bucket->lock);
+}
+
+void
+turnstile_td_propagate_priority(struct turnstile_td *td)
+{
+ struct thread *thread;
+
+ thread = structof(td, struct thread, turnstile_td);
+
+ thread_ref(thread);
+ spinlock_lock(&td->lock);
+ turnstile_td_propagate_priority_loop(td);
+}
+
+static void
+turnstile_assert_init_state(const struct turnstile *turnstile)
+{
+ assert(turnstile->bucket == NULL);
+ assert(turnstile->sync_obj == NULL);
+ assert(plist_empty(&turnstile->waiters));
+ assert(turnstile->next_free == NULL);
+ assert(turnstile->top_waiter == NULL);
+ assert(turnstile->owner == NULL);
+}
+
+static void
+turnstile_use(struct turnstile *turnstile, const void *sync_obj)
+{
+ assert(turnstile->sync_obj == NULL);
+ turnstile->sync_obj = sync_obj;
+}
+
+static void
+turnstile_unuse(struct turnstile *turnstile)
+{
+ assert(turnstile->sync_obj != NULL);
+ turnstile->sync_obj = NULL;
+}
+
+static bool
+turnstile_in_use(const struct turnstile *turnstile)
+{
+ return turnstile->sync_obj != NULL;
+}
+
+static bool
+turnstile_in_use_by(const struct turnstile *turnstile, const void *sync_obj)
+{
+ return turnstile->sync_obj == sync_obj;
+}
+
+static void
+turnstile_bucket_init(struct turnstile_bucket *bucket)
+{
+ spinlock_init(&bucket->lock);
+ list_init(&bucket->list);
+}
+
+static struct turnstile_bucket *
+turnstile_bucket_get(const void *sync_obj)
+{
+ uintptr_t index;
+
+ index = turnstile_hash(sync_obj) & TURNSTILE_HTABLE_MASK;
+ assert(index < ARRAY_SIZE(turnstile_htable));
+ return &turnstile_htable[index];
+}
+
+static void
+turnstile_bucket_add(struct turnstile_bucket *bucket,
+ struct turnstile *turnstile)
+{
+ assert(turnstile->bucket == NULL);
+ turnstile->bucket = bucket;
+ list_insert_tail(&bucket->list, &turnstile->node);
+}
+
+static void
+turnstile_bucket_remove(struct turnstile_bucket *bucket,
+ struct turnstile *turnstile)
+{
+ assert(turnstile->bucket == bucket);
+ turnstile->bucket = NULL;
+ list_remove(&turnstile->node);
+}
+
+static struct turnstile *
+turnstile_bucket_lookup(const struct turnstile_bucket *bucket,
+ const void *sync_obj)
+{
+ struct turnstile *turnstile;
+
+ list_for_each_entry(&bucket->list, turnstile, node) {
+ if (turnstile_in_use_by(turnstile, sync_obj)) {
+ return turnstile;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+turnstile_ctor(void *ptr)
+{
+ struct turnstile *turnstile;
+
+ turnstile = ptr;
+ turnstile->bucket = NULL;
+ turnstile->sync_obj = NULL;
+ plist_init(&turnstile->waiters);
+ turnstile->next_free = NULL;
+ turnstile->top_waiter = NULL;
+ turnstile->owner = NULL;
+}
+
+void __init
+turnstile_setup(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(turnstile_htable); i++) {
+ turnstile_bucket_init(&turnstile_htable[i]);
+ }
+
+ kmem_cache_init(&turnstile_cache, "turnstile", sizeof(struct turnstile),
+ CPU_L1_SIZE, turnstile_ctor, 0);
+}
+
+struct turnstile *
+turnstile_create(void)
+{
+ struct turnstile *turnstile;
+
+ turnstile = kmem_cache_alloc(&turnstile_cache);
+
+ if (turnstile == NULL) {
+ return NULL;
+ }
+
+ turnstile_assert_init_state(turnstile);
+ return turnstile;
+}
+
+void
+turnstile_destroy(struct turnstile *turnstile)
+{
+ turnstile_assert_init_state(turnstile);
+ kmem_cache_free(&turnstile_cache, turnstile);
+}
+
+struct turnstile *
+turnstile_acquire(const void *sync_obj)
+{
+ struct turnstile_bucket *bucket;
+ struct turnstile *turnstile;
+
+ assert(sync_obj != NULL);
+
+ bucket = turnstile_bucket_get(sync_obj);
+
+ spinlock_lock(&bucket->lock);
+
+ turnstile = turnstile_bucket_lookup(bucket, sync_obj);
+
+ if (turnstile == NULL) {
+ spinlock_unlock(&bucket->lock);
+ return NULL;
+ }
+
+ return turnstile;
+}
+
+void
+turnstile_release(struct turnstile *turnstile)
+{
+ spinlock_unlock(&turnstile->bucket->lock);
+}
+
+static void
+turnstile_push_free(struct turnstile *turnstile,
+ struct turnstile *free_turnstile)
+{
+ assert(free_turnstile->next_free == NULL);
+ free_turnstile->next_free = turnstile->next_free;
+ turnstile->next_free = free_turnstile;
+}
+
+static struct turnstile *
+turnstile_pop_free(struct turnstile *turnstile)
+{
+ struct turnstile *free_turnstile;
+
+ free_turnstile = turnstile->next_free;
+
+ if (free_turnstile == NULL) {
+ return NULL;
+ }
+
+ turnstile->next_free = free_turnstile->next_free;
+ free_turnstile->next_free = NULL;
+ return free_turnstile;
+}
+
+struct turnstile *
+turnstile_lend(const void *sync_obj)
+{
+ struct turnstile_bucket *bucket;
+ struct turnstile *turnstile, *prev;
+ struct turnstile_td *td;
+
+ assert(sync_obj != NULL);
+
+ turnstile = thread_turnstile_lend();
+ turnstile_assert_init_state(turnstile);
+
+ td = thread_turnstile_td(thread_self());
+ bucket = turnstile_bucket_get(sync_obj);
+
+ spinlock_lock(&bucket->lock);
+
+ prev = turnstile_bucket_lookup(bucket, sync_obj);
+
+ if (prev == NULL) {
+ turnstile_use(turnstile, sync_obj);
+ turnstile_bucket_add(bucket, turnstile);
+ } else {
+ turnstile_push_free(prev, turnstile);
+ turnstile = prev;
+ }
+
+ spinlock_lock(&td->lock);
+ turnstile_td_set_turnstile(td, turnstile);
+ spinlock_unlock(&td->lock);
+
+ return turnstile;
+}
+
+void
+turnstile_return(struct turnstile *turnstile)
+{
+ struct turnstile_bucket *bucket;
+ struct turnstile *free_turnstile;
+ struct turnstile_td *td;
+
+ assert(turnstile_in_use(turnstile));
+
+ td = thread_turnstile_td(thread_self());
+
+ spinlock_lock(&td->lock);
+ turnstile_td_set_turnstile(td, NULL);
+ spinlock_unlock(&td->lock);
+
+ bucket = turnstile->bucket;
+ free_turnstile = turnstile_pop_free(turnstile);
+
+ if (free_turnstile == NULL) {
+ turnstile_bucket_remove(bucket, turnstile);
+ turnstile_unuse(turnstile);
+ free_turnstile = turnstile;
+ }
+
+ spinlock_unlock(&bucket->lock);
+
+ turnstile_assert_init_state(free_turnstile);
+ thread_turnstile_return(free_turnstile);
+}
+
+bool
+turnstile_empty(const struct turnstile *turnstile)
+{
+ return plist_empty(&turnstile->waiters);
+}
+
+static void
+turnstile_set_owner(struct turnstile *turnstile, struct thread *owner)
+{
+ struct turnstile_td *td;
+
+ assert(owner != NULL);
+ assert((turnstile->owner == NULL) || (turnstile->owner == owner));
+
+ td = thread_turnstile_td(owner);
+
+ thread_ref(owner);
+ spinlock_lock(&td->lock);
+
+ if (turnstile->owner == NULL) {
+ turnstile_td_own(td, turnstile);
+ }
+
+ spinlock_unlock(&turnstile->bucket->lock);
+
+ turnstile_td_propagate_priority_loop(td);
+
+ spinlock_lock(&turnstile->bucket->lock);
+}
+
+void
+turnstile_wait(struct turnstile *turnstile, const char *wchan,
+ struct thread *owner)
+{
+ struct turnstile_waiter waiter;
+ struct turnstile_td *td;
+ struct thread *thread;
+
+ thread = thread_self();
+ assert(thread != owner);
+
+ td = thread_turnstile_td(thread);
+
+ spinlock_lock(&td->lock);
+ turnstile_waiter_init(&waiter, thread);
+ turnstile_add_waiter(turnstile, &waiter);
+ turnstile_td_set_waiter(td, &waiter);
+ spinlock_unlock(&td->lock);
+
+ if (owner == NULL) {
+ if (turnstile->top_waiter == &waiter) {
+ turnstile_waiter_set_awaken(&waiter);
+ }
+ } else {
+ /* This function temporarily unlocks the turnstile */
+ turnstile_set_owner(turnstile, owner);
+ }
+
+ for (;;) {
+ if (!turnstile_waiter_awaken(&waiter)) {
+ thread_sleep(&turnstile->bucket->lock, turnstile->sync_obj, wchan);
+ }
+
+ /*
+ * The real priority of a thread may change between waking up
+ * and reacquiring the turnstile.
+ */
+ if (turnstile->top_waiter == &waiter) {
+ break;
+ }
+
+ /* Otherwise, make sure the new top waiter is awakened */
+ turnstile_waiter_wakeup(turnstile->top_waiter);
+ turnstile_waiter_clear_awaken(&waiter);
+ }
+
+ spinlock_lock(&td->lock);
+ turnstile_td_set_waiter(td, NULL);
+ turnstile_remove_waiter(turnstile, &waiter);
+ spinlock_unlock(&td->lock);
+}
+
+void
+turnstile_signal(struct turnstile *turnstile)
+{
+ struct turnstile_waiter *waiter;
+
+ if (turnstile_empty(turnstile)) {
+ return;
+ }
+
+ waiter = plist_last_entry(&turnstile->waiters,
+ struct turnstile_waiter, node);
+ turnstile_waiter_wakeup(waiter);
+}
+
+void
+turnstile_own(struct turnstile *turnstile)
+{
+ struct turnstile_td *td;
+ struct thread *owner;
+ unsigned int top_priority;
+
+ assert(turnstile->owner == NULL);
+
+ if (turnstile_empty(turnstile)) {
+ return;
+ }
+
+ owner = thread_self();
+ top_priority = turnstile_waiter_priority(turnstile->top_waiter);
+ assert(thread_real_global_priority(owner) >= top_priority);
+ td = thread_turnstile_td(owner);
+
+ spinlock_lock(&td->lock);
+ turnstile_td_own(td, turnstile);
+ spinlock_unlock(&td->lock);
+}
+
+void
+turnstile_disown(struct turnstile *turnstile)
+{
+ struct turnstile_td *td;
+ struct thread *owner;
+
+ owner = thread_self();
+ assert(turnstile->owner == owner);
+ assert(!turnstile_empty(turnstile));
+
+ td = thread_turnstile_td(owner);
+
+ spinlock_lock(&td->lock);
+ turnstile_td_disown(td, turnstile);
+ spinlock_unlock(&td->lock);
+}
diff --git a/kern/turnstile.h b/kern/turnstile.h
new file mode 100644
index 00000000..21a5a1a9
--- /dev/null
+++ b/kern/turnstile.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2017 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Priority propagation capable sleep queues.
+ *
+ * Turnstiles are used to build sleeping synchronization primitives where
+ * ownership applies, such as mutexes. They allow threads with different
+ * priorities to contend on the same synchronization object without
+ * unbounded priority inversion.
+ */
+
+#ifndef _KERN_TURNSTILE_H
+#define _KERN_TURNSTILE_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include <kern/plist.h>
+#include <kern/spinlock.h>
+#include <kern/thread.h>
+#include <kern/turnstile_types.h>
+
+struct turnstile;
+
+/*
+ * Turnstile thread data.
+ */
+struct turnstile_td;
+
+#define turnstile_td_assert_lock(td) spinlock_assert_locked(&(td)->lock)
+
+/*
+ * Initialize turnstile thread data.
+ */
+static inline void
+turnstile_td_init(struct turnstile_td *td)
+{
+ spinlock_init(&td->lock);
+ td->turnstile = NULL;
+ td->waiter = NULL;
+ plist_init(&td->owned_turnstiles);
+ td->top_global_priority = 0;
+}
+
+/*
+ * Turnstile thread data locking functions.
+ */
+
+static inline void
+turnstile_td_lock(struct turnstile_td *td)
+{
+ spinlock_lock(&td->lock);
+}
+
+static inline int
+turnstile_td_trylock(struct turnstile_td *td)
+{
+ return spinlock_trylock(&td->lock);
+}
+
+static inline void
+turnstile_td_unlock(struct turnstile_td *td)
+{
+ spinlock_unlock(&td->lock);
+}
+
+/*
+ * Functions managing the turnstile a thread is sleeping in.
+ */
+
+static inline void
+turnstile_td_set_turnstile(struct turnstile_td *td,
+ struct turnstile *turnstile)
+{
+ td->turnstile = turnstile;
+}
+
+static inline struct turnstile *
+turnstile_td_get_turnstile(const struct turnstile_td *td)
+{
+ return td->turnstile;
+}
+
+/*
+ * Propagate priority starting at the thread containing the given thread data.
+ */
+void turnstile_td_propagate_priority(struct turnstile_td *td);
+
+/*
+ * Initialize the turnstile module.
+ */
+void turnstile_setup(void);
+
+/*
+ * Create/destroy a turnstile.
+ */
+struct turnstile * turnstile_create(void);
+void turnstile_destroy(struct turnstile *turnstile);
+
+/*
+ * Acquire/release a turnstile.
+ *
+ * Acquiring a turnstile serializes all access and disables preemption.
+ */
+struct turnstile * turnstile_acquire(const void *sync_obj);
+void turnstile_release(struct turnstile *turnstile);
+
+/*
+ * Lend/return a turnstile.
+ *
+ * A thread lends its private turnstile to the turnstile module in
+ * order to prepare its sleep. The turnstile obtained on lending
+ * is either the thread's turnstile, or an already existing turnstile
+ * for this synchronization object if another thread is waiting.
+ *
+ * When multiple threads are waiting on the same turnstile, the extra
+ * turnstiles lent are kept in an internal free list, used when threads
+ * are awakened to return a turnstile to them.
+ *
+ * Note that the turnstile returned may not be the one lent.
+ *
+ * The turnstile obtained when lending is automatically acquired.
+ */
+struct turnstile * turnstile_lend(const void *sync_obj);
+void turnstile_return(struct turnstile *turnstile);
+
+/*
+ * Return true if the given turnstile has no waiters.
+ *
+ * The turnstile must be acquired when calling this function.
+ */
+bool turnstile_empty(const struct turnstile *turnstile);
+
+/*
+ * Wait for a wake up on the given turnstile.
+ *
+ * The turnstile must be lent when calling this function. It is
+ * released and later reacquired before returning from this function.
+ *
+ * The calling thread is considered a waiter as long as it didn't
+ * reacquire the turnstile. This means that signalling a turnstile
+ * has no visible effect on the number of waiters until the turnstile
+ * is released, e.g. if a single thread is waiting and another signals
+ * the turnstile, the turnstile is not immediately considered empty.
+ *
+ * If owner isn't NULL, it must refer to the thread currently owning
+ * the associated synchronization object. The priority of the caller
+ * is propagated to the chain of turnstiles and owners as necessary
+ * to prevent unbounded priority inversion.
+ */
+void turnstile_wait(struct turnstile *turnstile, const char *wchan,
+ struct thread *owner);
+
+/*
+ * Wake up a thread waiting on the given turnstile, if any.
+ *
+ * The turnstile must be acquired when calling this function.
+ * Since a turnstile must be lent (and in turn is automatically
+ * acquired) when waiting, and acquired in order to signal it,
+ * wake-ups are serialized and cannot be missed.
+ */
+void turnstile_signal(struct turnstile *turnstile);
+
+/*
+ * Own/disown a turnstile.
+ *
+ * The turnstile must be lent when taking ownership, acquired when
+ * releasing it. Owning has no effect on empty turnstiles.
+ * Conversely, an empty turnstile cannot be disowned.
+ *
+ * Ownership must be updated atomically with regard to the ownership
+ * of the associated synchronization object.
+ */
+void turnstile_own(struct turnstile *turnstile);
+void turnstile_disown(struct turnstile *turnstile);
+
+#endif /* _KERN_TURNSTILE_H */
diff --git a/kern/turnstile_types.h b/kern/turnstile_types.h
new file mode 100644
index 00000000..a95b691e
--- /dev/null
+++ b/kern/turnstile_types.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Isolated type definition used to avoid inclusion circular dependencies.
+ */
+
+#ifndef _KERN_TURNSTILE_TYPES_H
+#define _KERN_TURNSTILE_TYPES_H
+
+#include <kern/plist_types.h>
+#include <kern/spinlock_types.h>
+
+struct turnstile;
+struct turnstile_waiter;
+
+/*
+ * Per-thread turnstile data.
+ *
+ * The turnstile member indicates whether this thread is in a turnstile,
+ * and is only valid if the thread is not running.
+ *
+ * The waiter points to the structure a thread uses to queue itself on
+ * a turnstile. It is used to access a sleeping thread from another
+ * thread, e.g. on wake-ups or priority updates.
+ *
+ * The list of owned turnstiles is used by priority propagation to
+ * determine the top priority among all waiters, which is stored in
+ * the thread data so that turnstiles are quickly unlocked.
+ *
+ * Locking keys :
+ * (b) bucket
+ * (t) turnstile_td
+ */
+struct turnstile_td {
+ struct spinlock lock;
+ struct turnstile *turnstile; /* (t) */
+ struct turnstile_waiter *waiter; /* (b,t) */
+ struct plist owned_turnstiles; /* (t) */
+ unsigned int top_global_priority; /* (t) */
+ unsigned char top_sched_policy; /* (t) */
+ unsigned short top_priority; /* (t) */
+};
+
+#endif /* _KERN_TURNSTILE_TYPES_H */