author    | Richard Braun <rbraun@sceen.net> | 2017-03-04 17:02:09 +0100
committer | Richard Braun <rbraun@sceen.net> | 2017-03-04 17:02:09 +0100
commit    | 0b2db2776b3a4fb208f13867934147f8cef79b60 (patch)
tree      | a685ad29009ecfe1aacc9901b67490fb2b6db48f
parent    | cee25b55290865734d92a6b6262fe79887288b1a (diff)
parent    | 4121baff3354f8bd1a4ae62b68af29c6f3965462 (diff)
Merge branch 'sleep_rework'
-rw-r--r-- | INSTALL                |   5
-rw-r--r-- | Makefrag.am            |  16
-rw-r--r-- | configure.ac           |  13
-rw-r--r-- | kern/condition.c       | 194
-rw-r--r-- | kern/condition.h       |  56
-rw-r--r-- | kern/condition_types.h |   9
-rw-r--r-- | kern/kernel.c          |   4
-rw-r--r-- | kern/mutex.c           |  48
-rw-r--r-- | kern/mutex.h           |  98
-rw-r--r-- | kern/mutex_i.h         |  68
-rw-r--r-- | kern/mutex_types.h     |  19
-rw-r--r-- | kern/plist.c           |  65
-rw-r--r-- | kern/plist.h           | 270
-rw-r--r-- | kern/plist_types.h     |  50
-rw-r--r-- | kern/rtmutex.c         | 108
-rw-r--r-- | kern/rtmutex.h         | 111
-rw-r--r-- | kern/rtmutex_i.h       |  79
-rw-r--r-- | kern/rtmutex_types.h   |  30
-rw-r--r-- | kern/sleepq.c          | 382
-rw-r--r-- | kern/sleepq.h          | 126
-rw-r--r-- | kern/task.c            |  15
-rw-r--r-- | kern/thread.c          | 421
-rw-r--r-- | kern/thread.h          | 216
-rw-r--r-- | kern/thread_i.h        |  53
-rw-r--r-- | kern/turnstile.c       | 788
-rw-r--r-- | kern/turnstile.h       | 191
-rw-r--r-- | kern/turnstile_types.h |  58
-rw-r--r-- | test/test_mutex_pi.c   | 482
28 files changed, 3654 insertions, 321 deletions
@@ -243,6 +243,11 @@ X15 Options in a file named `test_pmap_update_mp.c' would be selected with the option `--enable-test-module=pmap_update_mp'. +`--enable-mutex-pi' + Enable priority inheritance for regular mutexes (note that priority + inheritance is always enabled for real-time mutexes). + TODO Thoroughly describe the implications of this option + `--with-max-cpus=MAX_CPUS' Set the maximum number of supported processors. diff --git a/Makefrag.am b/Makefrag.am index a42c5ea8..f0d8a393 100644 --- a/Makefrag.am +++ b/Makefrag.am @@ -41,6 +41,9 @@ x15_SOURCES += \ kern/param.h \ kern/percpu.c \ kern/percpu.h \ + kern/plist.c \ + kern/plist.h \ + kern/plist_types.h \ kern/printk.c \ kern/printk.h \ kern/rbtree.c \ @@ -49,6 +52,12 @@ x15_SOURCES += \ kern/rdxtree.c \ kern/rdxtree.h \ kern/rdxtree_i.h \ + kern/rtmutex.c \ + kern/rtmutex.h \ + kern/rtmutex_i.h \ + kern/rtmutex_types.h \ + kern/sleepq.c \ + kern/sleepq.h \ kern/spinlock.h \ kern/spinlock_i.h \ kern/spinlock_types.h \ @@ -64,6 +73,9 @@ x15_SOURCES += \ kern/thread.c \ kern/thread.h \ kern/thread_i.h \ + kern/turnstile.c \ + kern/turnstile.h \ + kern/turnstile_types.h \ kern/types.h \ kern/work.c \ kern/work.h \ @@ -90,6 +102,10 @@ if TEST_LLSYNC_DEFER x15_SOURCES += test/test_llsync_defer.c endif TEST_LLSYNC_DEFER +if TEST_MUTEX_PI +x15_SOURCES += test/test_mutex_pi.c +endif TEST_MUTEX_PI + if TEST_PMAP_UPDATE_MP x15_SOURCES += test/test_pmap_update_mp.c endif TEST_PMAP_UPDATE_MP diff --git a/configure.ac b/configure.ac index 023140bd..0b32388f 100644 --- a/configure.ac +++ b/configure.ac @@ -34,12 +34,19 @@ AC_ARG_WITH([max-cpus], [opt_max_cpus=$withval], [opt_max_cpus=128]) +AC_ARG_ENABLE([mutex-pi], + [AS_HELP_STRING([--enable-mutex-pi], + [enable priority inheritance for regular mutexes + (note that priority inheritance is always + enabled for real-time mutexes)])]) + AC_DEFINE([__KERNEL__], [1], [kernel code]) AC_DEFINE_UNQUOTED([ARCH], [$arch], [arch]) m4_define([ENABLE_TEST_MODULE], [AS_CASE(["$enable_test_module"], [llsync_defer], [test_llsync_defer=yes], + [mutex_pi], [test_mutex_pi=yes], [pmap_update_mp], [test_pmap_update_mp=yes], [sref_dirty_zeroes], [test_sref_dirty_zeroes=yes], [sref_noref], [test_sref_noref=yes], @@ -55,6 +62,8 @@ m4_define([ENABLE_TEST_MODULE], AS_IF([test x"$enable_test_module" != xno], [ENABLE_TEST_MODULE]) AM_CONDITIONAL([TEST_LLSYNC_DEFER], [test x"$test_llsync_defer" = xyes]) +AM_CONDITIONAL([TEST_MUTEX_PI], + [test x"$test_mutex_pi" = xyes]) AM_CONDITIONAL([TEST_PMAP_UPDATE_MP], [test x"$test_pmap_update_mp" = xyes]) AM_CONDITIONAL([TEST_SREF_DIRTY_ZEROES], @@ -74,6 +83,10 @@ AC_DEFINE_UNQUOTED([MAX_CPUS], [$opt_max_cpus], [maximum number of supported processors]) AC_MSG_NOTICE([maximum number of supported processors: $opt_max_cpus]) +AS_IF([test x"$enable_mutex_pi" = xyes], + [AC_DEFINE_UNQUOTED([X15_MUTEX_PI], [], + [Enable priority inheritance for regular mutexes])]) + AH_BOTTOM([#include <kern/config.h>]) AC_CONFIG_HEADER([config.h]) AC_CONFIG_FILES([Makefile]) diff --git a/kern/condition.c b/kern/condition.c index 467cb85a..bcd5ba42 100644 --- a/kern/condition.c +++ b/kern/condition.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Richard Braun. + * Copyright (c) 2013-2017 Richard Braun. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,124 +15,178 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. 
* * - * In order to avoid the infamous thundering herd problem, this implementation - * doesn't wake all threads when broadcasting a condition. Instead, they are - * queued on the mutex associated with the condition, and an attempt to wake - * one by locking and unlocking the mutex is performed. If the mutex is already - * locked, the current owner does the same when unlocking. + * Locking order : mutex -> sleep queue */ #include <stddef.h> #include <kern/assert.h> #include <kern/condition.h> -#include <kern/list.h> +#include <kern/condition_types.h> #include <kern/mutex.h> -#include <kern/mutex_i.h> -#include <kern/spinlock.h> +#include <kern/sleepq.h> #include <kern/thread.h> -void -condition_wait(struct condition *condition, struct mutex *mutex) +static void +condition_inc_nr_sleeping_waiters(struct condition *condition) { - struct mutex_waiter waiter; - unsigned int state; + condition->nr_sleeping_waiters++; + assert(condition->nr_sleeping_waiters != 0); +} - waiter.thread = thread_self(); +static void +condition_dec_nr_sleeping_waiters(struct condition *condition) +{ + assert(condition->nr_sleeping_waiters != 0); + condition->nr_sleeping_waiters--; +} - spinlock_lock(&condition->lock); +static void +condition_inc_nr_pending_waiters(struct condition *condition) +{ + condition->nr_pending_waiters++; + assert(condition->nr_pending_waiters != 0); +} - assert((condition->mutex == NULL) || (condition->mutex == mutex)); +static void +condition_dec_nr_pending_waiters(struct condition *condition) +{ + assert(condition->nr_pending_waiters != 0); + condition->nr_pending_waiters--; +} - if (condition->mutex == NULL) { - condition->mutex = mutex; - } +static void +condition_move_waiters(struct condition *condition) +{ + unsigned short old; - list_insert_tail(&condition->waiters, &waiter.node); + assert(condition->nr_sleeping_waiters != 0); + old = condition->nr_pending_waiters; + condition->nr_pending_waiters += condition->nr_sleeping_waiters; + assert(old < condition->nr_pending_waiters); + condition->nr_sleeping_waiters = 0; +} - spinlock_lock(&mutex->lock); +void +condition_wait(struct condition *condition, struct mutex *mutex) +{ + struct condition *last_cond; + struct sleepq *sleepq; + + mutex_assert_locked(mutex); + + /* + * Special case : + * + * mutex_lock(lock); + * + * for (;;) { + * while (!done) { + * condition_wait(condition, lock); + * } + * + * do_something(); + * } + * + * Pull the last condition before unlocking the mutex to prevent + * mutex_unlock() from reacquiring the condition sleep queue. 
+ */ + last_cond = thread_pull_last_cond(); + + sleepq = sleepq_lend(condition, true); + + mutex_unlock(mutex); + + if (last_cond != NULL) { + assert(last_cond == condition); + + if (condition->nr_pending_waiters != 0) { + sleepq_signal(sleepq); + } + } - state = mutex_release(mutex); + condition_inc_nr_sleeping_waiters(condition); + sleepq_wait(sleepq, "cond"); + condition_dec_nr_pending_waiters(condition); - if (state == MUTEX_CONTENDED) { - mutex_signal(mutex); + if (condition->nr_pending_waiters != 0) { + thread_set_last_cond(condition); } - spinlock_unlock(&condition->lock); + sleepq_return(sleepq); - mutex_wait(mutex, &waiter); - mutex_trydowngrade(mutex); - - spinlock_unlock(&mutex->lock); + mutex_lock(mutex); } void condition_signal(struct condition *condition) { - struct mutex_waiter *waiter; - struct mutex *mutex; - unsigned int state; + struct sleepq *sleepq; - spinlock_lock(&condition->lock); + sleepq = sleepq_acquire(condition, true); - if (condition->mutex == NULL) { - spinlock_unlock(&condition->lock); + if (sleepq == NULL) { return; } - mutex = condition->mutex; - waiter = list_first_entry(&condition->waiters, struct mutex_waiter, node); - list_remove(&waiter->node); - - if (list_empty(&condition->waiters)) { - condition->mutex = NULL; + if (condition->nr_sleeping_waiters == 0) { + goto out; } - spinlock_unlock(&condition->lock); - - spinlock_lock(&mutex->lock); - - mutex_queue(mutex, waiter); - state = mutex_tryacquire_slow(mutex); + sleepq_signal(sleepq); - if (state == MUTEX_UNLOCKED) { - mutex_release(mutex); - mutex_signal(mutex); - } + condition_dec_nr_sleeping_waiters(condition); + condition_inc_nr_pending_waiters(condition); - spinlock_unlock(&mutex->lock); +out: + sleepq_release(sleepq); } void condition_broadcast(struct condition *condition) { - struct list waiters; - struct mutex *mutex; - unsigned int state; + struct sleepq *sleepq; - spinlock_lock(&condition->lock); + sleepq = sleepq_acquire(condition, true); - if (condition->mutex == NULL) { - spinlock_unlock(&condition->lock); + if (sleepq == NULL) { return; } - mutex = condition->mutex; - condition->mutex = NULL; - list_set_head(&waiters, &condition->waiters); - list_init(&condition->waiters); + if (condition->nr_sleeping_waiters == 0) { + goto out; + } - spinlock_unlock(&condition->lock); + sleepq_signal(sleepq); - spinlock_lock(&mutex->lock); + condition_move_waiters(condition); - mutex_queue_list(mutex, &waiters); - state = mutex_tryacquire_slow(mutex); +out: + sleepq_release(sleepq); +} - if (state == MUTEX_UNLOCKED) { - mutex_release(mutex); - mutex_signal(mutex); +void +condition_wakeup(struct condition *condition) +{ + struct sleepq *sleepq; + + sleepq = sleepq_acquire(condition, true); + + if (sleepq == NULL) { + return; } - spinlock_unlock(&mutex->lock); + if (condition->nr_pending_waiters == 0) { + goto out; + } + + /* + * Rely on the FIFO ordering of sleep queues so that signalling multiple + * times always wakes up the same thread, as long as that thread didn't + * reacquire the sleep queue. + */ + sleepq_signal(sleepq); + +out: + sleepq_release(sleepq); } diff --git a/kern/condition.h b/kern/condition.h index 55d37bf2..0ce8d94f 100644 --- a/kern/condition.h +++ b/kern/condition.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Richard Braun. + * Copyright (c) 2013-2017 Richard Braun. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,34 +16,62 @@ * * * Condition variables. 
+ * + * A condition variable is a synchronization primitive used to wait + * until a predicate becomes true. Multiple threads can be waiting + * for this condition. In order to synchronize changes on the predicate + * with waiting and signalling, a condition variable must be associated + * with a mutex. */ #ifndef _KERN_CONDITION_H #define _KERN_CONDITION_H -#include <stddef.h> - #include <kern/condition_types.h> -#include <kern/list.h> -#include <kern/spinlock.h> +#include <kern/mutex_types.h> -/* - * Condition variable. - */ struct condition; +/* + * Initialize a condition variable. + */ static inline void condition_init(struct condition *condition) { - spinlock_init(&condition->lock); - condition->mutex = NULL; - list_init(&condition->waiters); + condition->nr_sleeping_waiters = 0; + condition->nr_pending_waiters = 0; } -void condition_wait(struct condition *cond, struct mutex *mutex); +/* + * Wait for a wake-up on the given condition variable. + * + * The associated mutex must be locked when calling this function. + * It is unlocked before waiting and relocked before returning. + */ +void condition_wait(struct condition *condition, struct mutex *mutex); -void condition_signal(struct condition *cond); +/* + * Wake up one (signal) or all (broadcast) threads waiting on a + * condition variable, if any. + * + * Although it is not necessary to hold the mutex associated to the + * condition variable when calling these functions, doing so guarantees + * that a wake-up done when changing the predicate cannot be missed by + * waiting threads. + */ +void condition_signal(struct condition *condition); +void condition_broadcast(struct condition *condition); -void condition_broadcast(struct condition *cond); +/* + * Wake up a pending thread. + * + * This function isn't part of the standard condition variable interface. + * It is used to chain wake-ups to avoid the thundering herd effect. + * When broadcasting a condition variable, a single thread is actually + * awaken. Other threads become "pending waiters", still asleep but + * eligible for wake-up when the mutex associated to the condition variable, + * relocked when returning from condition_wait(), is finally unlocked. 
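For reference, a minimal usage sketch of the condition variable interface above, following the predicate-loop pattern documented in condition_wait(); the example structure, its fields and helper names are illustrative, not part of the patch:

```c
#include <kern/condition.h>
#include <kern/mutex.h>

struct example {
    struct mutex lock;
    struct condition cond;
    int ready;                  /* predicate, protected by lock */
};

static void
example_init(struct example *ex)
{
    mutex_init(&ex->lock);
    condition_init(&ex->cond);
    ex->ready = 0;
}

static void
example_wait(struct example *ex)
{
    mutex_lock(&ex->lock);

    /* Always recheck the predicate after waking up. */
    while (!ex->ready) {
        condition_wait(&ex->cond, &ex->lock);
    }

    mutex_unlock(&ex->lock);
}

static void
example_post(struct example *ex)
{
    mutex_lock(&ex->lock);
    ex->ready = 1;
    condition_signal(&ex->cond);
    mutex_unlock(&ex->lock);
}
```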
+ */ +void condition_wakeup(struct condition *condition); #endif /* _KERN_CONDITION_H */ diff --git a/kern/condition_types.h b/kern/condition_types.h index 03b22676..13a29205 100644 --- a/kern/condition_types.h +++ b/kern/condition_types.h @@ -21,14 +21,9 @@ #ifndef _KERN_CONDITION_TYPES_H #define _KERN_CONDITION_TYPES_H -#include <kern/list_types.h> -#include <kern/mutex_types.h> -#include <kern/spinlock_types.h> - struct condition { - struct spinlock lock; - struct mutex *mutex; - struct list waiters; + unsigned short nr_sleeping_waiters; + unsigned short nr_pending_waiters; }; #endif /* _KERN_CONDITION_TYPES_H */ diff --git a/kern/kernel.c b/kern/kernel.c index b63be5d2..cd3d3e3b 100644 --- a/kern/kernel.c +++ b/kern/kernel.c @@ -20,9 +20,11 @@ #include <kern/kernel.h> #include <kern/llsync.h> #include <kern/percpu.h> +#include <kern/sleepq.h> #include <kern/sref.h> #include <kern/task.h> #include <kern/thread.h> +#include <kern/turnstile.h> #include <kern/work.h> #include <kern/xcall.h> #include <machine/cpu.h> @@ -41,6 +43,8 @@ kernel_main(void) cpumap_setup(); xcall_setup(); task_setup(); + sleepq_setup(); + turnstile_setup(); thread_setup(); work_setup(); llsync_setup(); diff --git a/kern/mutex.c b/kern/mutex.c index 58e24513..6a1af208 100644 --- a/kern/mutex.c +++ b/kern/mutex.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Richard Braun. + * Copyright (c) 2013-2017 Richard Braun. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,36 +15,54 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#ifndef X15_MUTEX_PI + +#include <stddef.h> + #include <kern/mutex.h> #include <kern/mutex_i.h> -#include <kern/spinlock.h> -#include <kern/thread.h> +#include <kern/sleepq.h> void mutex_lock_slow(struct mutex *mutex) { - struct mutex_waiter waiter; + struct sleepq *sleepq; unsigned int state; - spinlock_lock(&mutex->lock); + sleepq = sleepq_lend(mutex, false); + + for (;;) { + state = atomic_swap_uint(&mutex->state, MUTEX_CONTENDED); - state = mutex_tryacquire_slow(mutex); + if (state == MUTEX_UNLOCKED) { + break; + } - if (state != MUTEX_UNLOCKED) { - waiter.thread = thread_self(); - mutex_queue(mutex, &waiter); - mutex_wait(mutex, &waiter); + sleepq_wait(sleepq, "mutex"); } - mutex_trydowngrade(mutex); + if (sleepq_empty(sleepq)) { + state = atomic_swap_uint(&mutex->state, MUTEX_LOCKED); + assert(state == MUTEX_CONTENDED); + } - spinlock_unlock(&mutex->lock); + sleepq_return(sleepq); } void mutex_unlock_slow(struct mutex *mutex) { - spinlock_lock(&mutex->lock); - mutex_signal(mutex); - spinlock_unlock(&mutex->lock); + struct sleepq *sleepq; + + sleepq = sleepq_acquire(mutex, false); + + if (sleepq == NULL) { + return; + } + + sleepq_signal(sleepq); + + sleepq_release(sleepq); } + +#endif /* X15_MUTEX_PI */ diff --git a/kern/mutex.h b/kern/mutex.h index 64efa350..c8ae4639 100644 --- a/kern/mutex.h +++ b/kern/mutex.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Richard Braun. + * Copyright (c) 2013-2017 Richard Braun. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,33 +15,84 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * * - * Mutual exclusion locks. + * Mutual exclusion sleep locks. * * Unlike spin locks, acquiring a mutex may make the calling thread sleep. + * + * TODO Adaptive spinning. 
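To summarize the lock-word protocol used by the plain (non-PI) mutex rework above, here is a standalone sketch of the three-state transitions, written with C11 atomics and a spin-wait standing in for the sleep queue calls; it is a model under those assumptions, not the kernel code itself:

```c
#include <stdatomic.h>

#define MUTEX_UNLOCKED  0
#define MUTEX_LOCKED    1
#define MUTEX_CONTENDED 2

struct mutex_model {
    atomic_uint state;
};

static void
mutex_model_lock(struct mutex_model *m)
{
    unsigned int prev = MUTEX_UNLOCKED;

    /* Fast path: UNLOCKED -> LOCKED. */
    if (atomic_compare_exchange_strong(&m->state, &prev, MUTEX_LOCKED)) {
        return;
    }

    /*
     * Slow path: force the state to CONTENDED so the owner takes its own
     * slow path on unlock, and retry until the swap observes UNLOCKED.
     * The real mutex_lock_slow() sleeps on the mutex sleep queue instead
     * of spinning, and downgrades to LOCKED when no waiters remain.
     */
    while (atomic_exchange(&m->state, MUTEX_CONTENDED) != MUTEX_UNLOCKED) {
        /* sleepq_wait(sleepq, "mutex") in the kernel implementation */
    }
}

static void
mutex_model_unlock(struct mutex_model *m)
{
    /* Fast path: LOCKED -> UNLOCKED. CONTENDED means waiters must be woken. */
    if (atomic_exchange(&m->state, MUTEX_UNLOCKED) == MUTEX_CONTENDED) {
        /* sleepq_signal() through sleepq_acquire()/sleepq_release() */
    }
}
```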
*/ #ifndef _KERN_MUTEX_H #define _KERN_MUTEX_H +#include <kern/mutex_types.h> + +#ifdef X15_MUTEX_PI + +#include <kern/rtmutex.h> + +struct mutex; + +#define mutex_assert_locked(mutex) rtmutex_assert_locked(&(mutex)->rtmutex) + +static inline void +mutex_init(struct mutex *mutex) +{ + rtmutex_init(&mutex->rtmutex); +} + +static inline int +mutex_trylock(struct mutex *mutex) +{ + return rtmutex_trylock(&mutex->rtmutex); +} + +static inline void +mutex_lock(struct mutex *mutex) +{ + rtmutex_lock(&mutex->rtmutex); +} + +static inline void +mutex_unlock(struct mutex *mutex) +{ + rtmutex_unlock(&mutex->rtmutex); + + /* + * If this mutex was used along with a condition variable, wake up + * a potential pending waiter. This must be done after the mutex is + * unlocked so that a higher priority thread can directly acquire it. + */ + thread_wakeup_last_cond(); +} + +#else /* X15_MUTEX_PI */ + #include <kern/assert.h> #include <kern/error.h> -#include <kern/list.h> #include <kern/mutex_i.h> -#include <kern/mutex_types.h> -#include <kern/spinlock.h> +#include <kern/thread.h> struct mutex; +#define mutex_assert_locked(mutex) assert((mutex)->state != MUTEX_UNLOCKED) + +/* + * Initialize a mutex. + */ static inline void mutex_init(struct mutex *mutex) { mutex->state = MUTEX_UNLOCKED; - spinlock_init(&mutex->lock); - list_init(&mutex->waiters); } -#define mutex_assert_locked(mutex) assert((mutex)->state != MUTEX_UNLOCKED) - +/* + * Attempt to lock the given mutex. + * + * This function may not sleep. + * + * Return 0 on success, ERROR_BUSY if the mutex is already locked. + */ static inline int mutex_trylock(struct mutex *mutex) { @@ -56,6 +107,14 @@ mutex_trylock(struct mutex *mutex) return ERROR_BUSY; } +/* + * Lock a mutex. + * + * If the mutex is already locked, the calling thread sleeps until the + * mutex is unlocked. + * + * A mutex can only be locked once. + */ static inline void mutex_lock(struct mutex *mutex) { @@ -72,6 +131,12 @@ mutex_lock(struct mutex *mutex) mutex_lock_slow(mutex); } +/* + * Unlock a mutex. + * + * The mutex must be locked, and must have been locked by the calling + * thread. + */ static inline void mutex_unlock(struct mutex *mutex) { @@ -79,13 +144,18 @@ mutex_unlock(struct mutex *mutex) state = mutex_release(mutex); - if (state == MUTEX_LOCKED) { - return; + if (state != MUTEX_LOCKED) { + assert(state == MUTEX_CONTENDED); + mutex_unlock_slow(mutex); } - assert(state == MUTEX_CONTENDED); - - mutex_unlock_slow(mutex); + /* + * If this mutex was used along with a condition variable, wake up + * a potential pending waiter. + */ + thread_wakeup_last_cond(); } +#endif /* X15_MUTEX_PI */ + #endif /* _KERN_MUTEX_H */ diff --git a/kern/mutex_i.h b/kern/mutex_i.h index 5e78a4da..158d8f7f 100644 --- a/kern/mutex_i.h +++ b/kern/mutex_i.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Richard Braun. + * Copyright (c) 2013-2017 Richard Braun. 
* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,25 +18,16 @@ #ifndef _KERN_MUTEX_I_H #define _KERN_MUTEX_I_H +#ifndef X15_MUTEX_PI + #include <kern/assert.h> -#include <kern/list.h> #include <kern/mutex_types.h> -#include <kern/thread.h> #include <machine/atomic.h> #define MUTEX_UNLOCKED 0 #define MUTEX_LOCKED 1 #define MUTEX_CONTENDED 2 -struct mutex_waiter { - struct list node; - struct thread *thread; -}; - -void mutex_lock_slow(struct mutex *mutex); - -void mutex_unlock_slow(struct mutex *mutex); - static inline unsigned int mutex_tryacquire(struct mutex *mutex) { @@ -44,12 +35,6 @@ mutex_tryacquire(struct mutex *mutex) } static inline unsigned int -mutex_tryacquire_slow(struct mutex *mutex) -{ - return atomic_swap_uint(&mutex->state, MUTEX_CONTENDED); -} - -static inline unsigned int mutex_release(struct mutex *mutex) { unsigned int state; @@ -59,51 +44,10 @@ mutex_release(struct mutex *mutex) return state; } -static inline void -mutex_queue(struct mutex *mutex, struct mutex_waiter *waiter) -{ - list_insert_tail(&mutex->waiters, &waiter->node); -} - -static inline void -mutex_queue_list(struct mutex *mutex, struct list *waiters) -{ - list_concat(&mutex->waiters, waiters); -} - -static inline void -mutex_wait(struct mutex *mutex, struct mutex_waiter *waiter) -{ - unsigned int state; - - do { - thread_sleep(&mutex->lock, mutex, "mutex"); - state = mutex_tryacquire_slow(mutex); - } while (state != MUTEX_UNLOCKED); - - list_remove(&waiter->node); -} - -static inline void -mutex_signal(struct mutex *mutex) -{ - struct mutex_waiter *waiter; - - if (!list_empty(&mutex->waiters)) { - waiter = list_first_entry(&mutex->waiters, struct mutex_waiter, node); - thread_wakeup(waiter->thread); - } -} +void mutex_lock_slow(struct mutex *mutex); -static inline void -mutex_trydowngrade(struct mutex *mutex) -{ - if (list_empty(&mutex->waiters)) { - unsigned int state; +void mutex_unlock_slow(struct mutex *mutex); - state = atomic_swap_uint(&mutex->state, MUTEX_LOCKED); - assert(state == MUTEX_CONTENDED); - } -} +#endif /* X15_MUTEX_PI */ #endif /* _KERN_MUTEX_I_H */ diff --git a/kern/mutex_types.h b/kern/mutex_types.h index b348fb7b..4b7947fc 100644 --- a/kern/mutex_types.h +++ b/kern/mutex_types.h @@ -21,13 +21,24 @@ #ifndef _KERN_MUTEX_TYPES_H #define _KERN_MUTEX_TYPES_H -#include <kern/list_types.h> -#include <kern/spinlock_types.h> +#ifdef X15_MUTEX_PI + +#include <kern/rtmutex_types.h> + +/* + * Do not directly alias rtmutex to make sure they cannot be used + * with condition variables by mistake. + */ +struct mutex { + struct rtmutex rtmutex; +}; + +#else /* X15_MUTEX_PI */ struct mutex { unsigned int state; - struct spinlock lock; - struct list waiters; }; +#endif /* X15_MUTEX_PI */ + #endif /* _KERN_MUTEX_TYPES_H */ diff --git a/kern/plist.c b/kern/plist.c new file mode 100644 index 00000000..851de629 --- /dev/null +++ b/kern/plist.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <kern/list.h> +#include <kern/plist.h> + +void +plist_add(struct plist *plist, struct plist_node *pnode) +{ + struct plist_node *next; + + if (plist_empty(plist)) { + list_insert_head(&plist->list, &pnode->node); + list_insert_head(&plist->prio_list, &pnode->prio_node); + return; + } + + list_for_each_entry(&plist->prio_list, next, prio_node) { + if (pnode->priority < next->priority) { + break; + } + } + + if (list_end(&plist->prio_list, &next->prio_node) + || (pnode->priority != next->priority)) { + list_insert_before(&next->prio_node, &pnode->prio_node); + } else { + list_init(&pnode->prio_node); + } + + list_insert_before(&next->node, &pnode->node); +} + +void +plist_remove(struct plist *plist, struct plist_node *pnode) +{ + struct plist_node *next; + + if (!list_node_unlinked(&pnode->prio_node)) { + next = list_next_entry(pnode, node); + + if (!list_end(&plist->list, &next->node) + && list_node_unlinked(&next->prio_node)) { + list_insert_after(&pnode->prio_node, &next->prio_node); + } + + list_remove(&pnode->prio_node); + } + + list_remove(&pnode->node); +} diff --git a/kern/plist.h b/kern/plist.h new file mode 100644 index 00000000..dd8fdd00 --- /dev/null +++ b/kern/plist.h @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Priority list. + * + * This container acts as a doubly-linked list sorted by priority in + * ascending order. All operations behave as with a regular linked list + * except insertion, which is O(k), k being the number of priorities + * among the entries. + */ + +#ifndef _KERN_PLIST_H +#define _KERN_PLIST_H + +#include <stdbool.h> + +#include <kern/list.h> +#include <kern/macros.h> +#include <kern/plist_types.h> + +/* + * Priority list. + */ +struct plist; + +/* + * Priority list node. + */ +struct plist_node; + +/* + * Static priority list initializer. + */ +#define PLIST_INITIALIZER(plist) \ + { LIST_INITIALIZER((plist).list), LIST_INITIALIZER((plist).prio_list) } + +/* + * Initialize a priority list. + */ +static inline void +plist_init(struct plist *plist) +{ + list_init(&plist->list); + list_init(&plist->prio_list); +} + +/* + * Initialize a priority list node. + */ +static inline void +plist_node_init(struct plist_node *pnode, unsigned int priority) +{ + pnode->priority = priority; + list_node_init(&pnode->node); + list_node_init(&pnode->prio_node); +} + +/* + * Return the priority associated with a node. + */ +static inline unsigned int +plist_node_priority(const struct plist_node *pnode) +{ + return pnode->priority; +} + +/* + * Update the priority of an already initialized node. 
+ */ +static inline void +plist_node_set_priority(struct plist_node *pnode, unsigned int priority) +{ + pnode->priority = priority; +} + +/* + * Return true if pnode is in no priority lists. + */ +static inline bool +plist_node_unlinked(const struct plist_node *pnode) +{ + return list_node_unlinked(&pnode->node); +} + +/* + * Macro that evaluates to the address of the structure containing the + * given node based on the given type and member. + */ +#define plist_entry(pnode, type, member) structof(pnode, type, member) + +/* + * Return the first node of a priority list. + */ +static inline struct plist_node * +plist_first(const struct plist *plist) +{ + return list_first_entry(&plist->list, struct plist_node, node); +} + +/* + * Return the last node of a priority list. + */ +static inline struct plist_node * +plist_last(const struct plist *plist) +{ + return list_last_entry(&plist->list, struct plist_node, node); +} + +/* + * Return the node next to the given node. + */ +static inline struct plist_node * +plist_next(const struct plist_node *pnode) +{ + return (struct plist_node *)list_next_entry(pnode, node); +} + +/* + * Return the node previous to the given node. + */ +static inline struct plist_node * +plist_prev(const struct plist_node *pnode) +{ + return (struct plist_node *)list_prev_entry(pnode, node); +} + +/* + * Get the first entry of a priority list. + */ +#define plist_first_entry(plist, type, member) \ + plist_entry(plist_first(plist), type, member) + +/* + * Get the last entry of a priority list. + */ +#define plist_last_entry(plist, type, member) \ + plist_entry(plist_last(plist), type, member) + +/* + * Get the entry next to the given entry. + */ +#define plist_next_entry(entry, member) \ + plist_entry(plist_next(&(entry)->member), typeof(*(entry)), member) + +/* + * Get the entry previous to the given entry. + */ +#define plist_prev_entry(entry, member) \ + plist_entry(plist_prev(&(entry)->member), typeof(*(entry)), member) + +/* + * Return true if node is after the last or before the first node of + * a priority list. + */ +static inline bool +plist_end(const struct plist *plist, const struct plist_node *pnode) +{ + return list_end(&plist->list, &pnode->node); +} + +/* + * Return true if plist is empty. + */ +static inline bool +plist_empty(const struct plist *plist) +{ + return list_empty(&plist->list); +} + +/* + * Return true if plist contains exactly one node. + */ +static inline bool +plist_singular(const struct plist *plist) +{ + return list_singular(&plist->list); +} + +/* + * Add a node to a priority list. + * + * If the priority list already contains nodes with the same priority + * as the given node, it is inserted before them. + * + * The node must be initialized before calling this function. + */ +void plist_add(struct plist *plist, struct plist_node *pnode); + +/* + * Remove a node from a priority list. + * + * After completion, the node is stale. + */ +void plist_remove(struct plist *plist, struct plist_node *pnode); + +/* + * Forge a loop to process all nodes of a priority list. + * + * The node must not be altered during the loop. + */ +#define plist_for_each(plist, pnode) \ +for (pnode = plist_first(plist); \ + !plist_end(plist, pnode); \ + pnode = plist_next(pnode)) + +/* + * Forge a loop to process all nodes of a priority list. 
+ */ +#define plist_for_each_safe(plist, pnode, tmp) \ +for (pnode = plist_first(plist), tmp = plist_next(pnode); \ + !plist_end(plist, pnode); \ + pnode = tmp, tmp = plist_next(pnode)) + +/* + * Version of plist_for_each() that processes nodes backward. + */ +#define plist_for_each_reverse(plist, pnode) \ +for (pnode = plist_last(plist); \ + !plist_end(plist, pnode); \ + pnode = plist_prev(pnode)) + +/* + * Version of plist_for_each_safe() that processes nodes backward. + */ +#define plist_for_each_reverse_safe(plist, pnode, tmp) \ +for (pnode = plist_last(plist), tmp = plist_prev(pnode); \ + !plist_end(plist, pnode); \ + pnode = tmp, tmp = plist_prev(pnode)) + +/* + * Forge a loop to process all entries of a priority list. + * + * The entry node must not be altered during the loop. + */ +#define plist_for_each_entry(plist, entry, member) \ + list_for_each_entry(&(plist)->list, entry, member.node) + +/* + * Forge a loop to process all entries of a priority list. + */ +#define plist_for_each_entry_safe(plist, entry, tmp, member) \ + list_for_each_entry_safe(&(plist)->list, entry, tmp, member.node) + +/* + * Version of plist_for_each_entry() that processes entries backward. + */ +#define plist_for_each_entry_reverse(plist, entry, member) \ + list_for_each_entry_reverse(&(plist)->list, entry, member.node) + +/* + * Version of plist_for_each_entry_safe() that processes entries backward. + */ +#define plist_for_each_entry_reverse_safe(plist, entry, tmp, member) \ + list_for_each_entry_reverse_safe(&(plist)->list, entry, tmp, member.node) + +#endif /* _KERN_PLIST_H */ diff --git a/kern/plist_types.h b/kern/plist_types.h new file mode 100644 index 00000000..653d63f2 --- /dev/null +++ b/kern/plist_types.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Isolated type definition used to avoid inclusion circular dependencies. + */ + +#ifndef _KERN_PLIST_TYPES_H +#define _KERN_PLIST_TYPES_H + +#include <kern/list_types.h> + +/* + * The list member is used as the underlying regular linked list and + * contains all entries, sorted by priority in ascending order. The + * prio_list member contains exactly one entry for each priority + * present, also sorted by priority in ascending order. An entry + * on prio_list is the first entry in list for that priority. + * Here is a representation of a possible priority list : + * + * list--|1|--|3|--|3|--|3|--|4|--|6|--|6|--|8| + * | | | | | | | | | + * prio_list--+ +--+ +------------+ +--+ +-------+ + * + */ +struct plist { + struct list list; + struct list prio_list; +}; + +struct plist_node { + unsigned int priority; + struct list node; + struct list prio_node; +}; + +#endif /* _KERN_PLIST_TYPES_H */ diff --git a/kern/rtmutex.c b/kern/rtmutex.c new file mode 100644 index 00000000..f79f8a88 --- /dev/null +++ b/kern/rtmutex.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017 Richard Braun. 
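A short usage sketch of the plist interface and layout shown above; the item structure, names and priorities are illustrative only:

```c
#include <stddef.h>

#include <kern/plist.h>

struct item {
    struct plist_node pnode;
    const char *name;
};

/* All items, sorted by priority in ascending order. */
static struct plist item_list = PLIST_INITIALIZER(item_list);

static void
item_add(struct item *item, const char *name, unsigned int priority)
{
    item->name = name;
    plist_node_init(&item->pnode, priority);
    plist_add(&item_list, &item->pnode);
}

static const char *
item_highest_name(void)
{
    struct item *item;

    if (plist_empty(&item_list)) {
        return NULL;
    }

    /* Ascending order: the last entry has the highest priority. */
    item = plist_last_entry(&item_list, struct item, pnode);
    return item->name;
}
```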
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stddef.h> +#include <stdint.h> + +#include <kern/assert.h> +#include <kern/rtmutex.h> +#include <kern/rtmutex_i.h> +#include <kern/rtmutex_types.h> +#include <kern/thread.h> +#include <kern/turnstile.h> +#include <machine/atomic.h> + +static void +rtmutex_set_contended(struct rtmutex *rtmutex) +{ + uintptr_t owner, prev_owner; + + do { + owner = rtmutex->owner; + prev_owner = atomic_cas_uintptr(&rtmutex->owner, owner, + owner | RTMUTEX_CONTENDED); + } while (prev_owner != owner); +} + +void +rtmutex_lock_slow(struct rtmutex *rtmutex) +{ + struct turnstile *turnstile; + uintptr_t owner, prev_owner; + struct thread *thread; + uintptr_t bits; + + owner = (uintptr_t)thread_self(); + + turnstile = turnstile_lend(rtmutex); + + rtmutex_set_contended(rtmutex); + + bits = RTMUTEX_CONTENDED; + + for (;;) { + prev_owner = atomic_cas_uintptr(&rtmutex->owner, bits, owner | bits); + assert((prev_owner & bits) == bits); + + if (prev_owner == bits) { + break; + } + + thread = (struct thread *)(prev_owner & RTMUTEX_OWNER_MASK); + turnstile_wait(turnstile, "rtmutex", thread); + bits |= RTMUTEX_FORCE_WAIT; + } + + turnstile_own(turnstile); + + if (turnstile_empty(turnstile)) { + prev_owner = atomic_swap_uintptr(&rtmutex->owner, owner); + assert(prev_owner == (owner | bits)); + } + + turnstile_return(turnstile); + + /* + * A lock owner should never perform priority propagation on itself, + * because this process is done using its own priority, potentially + * introducing unbounded priority inversion. + * Instead, let new waiters do it, using their own priority. + */ +} + +void +rtmutex_unlock_slow(struct rtmutex *rtmutex) +{ + struct turnstile *turnstile; + uintptr_t owner, prev_owner; + + owner = (uintptr_t)thread_self(); + + turnstile = turnstile_acquire(rtmutex); + assert(turnstile != NULL); + + prev_owner = atomic_swap_uintptr(&rtmutex->owner, + RTMUTEX_FORCE_WAIT | RTMUTEX_CONTENDED); + assert((prev_owner & RTMUTEX_OWNER_MASK) == owner); + + turnstile_disown(turnstile); + turnstile_signal(turnstile); + + turnstile_release(turnstile); + + thread_propagate_priority(); +} diff --git a/kern/rtmutex.h b/kern/rtmutex.h new file mode 100644 index 00000000..f7cee716 --- /dev/null +++ b/kern/rtmutex.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Real-time mutual exclusion locks. + * + * A real-time mutex is similar to a regular mutex, except priority + * inheritance is unconditionnally enabled. + */ + +#ifndef _KERN_RTMUTEX_H +#define _KERN_RTMUTEX_H + +#include <stdint.h> + +#include <kern/assert.h> +#include <kern/error.h> +#include <kern/rtmutex_i.h> +#include <kern/rtmutex_types.h> + +struct rtmutex; + +#define rtmutex_assert_locked(rtmutex) assert((rtmutex)->owner != 0) + +/* + * Initialize a real-time mutex. + */ +static inline void +rtmutex_init(struct rtmutex *rtmutex) +{ + rtmutex->owner = 0; +} + +/* + * Attempt to lock the given real-time mutex. + * + * This function may not sleep. + * + * Return 0 on success, ERROR_BUSY if the mutex is already locked. + */ +static inline int +rtmutex_trylock(struct rtmutex *rtmutex) +{ + uintptr_t prev_owner; + + prev_owner = rtmutex_tryacquire(rtmutex); + + if (prev_owner == 0) { + return 0; + } + + return ERROR_BUSY; +} + +/* + * Lock a real-time mutex. + * + * If the mutex is already locked, the calling thread sleeps until the + * mutex is unlocked, and its priority is propagated as needed to prevent + * unbounded priority inversion. + * + * A mutex can only be locked once. + */ +static inline void +rtmutex_lock(struct rtmutex *rtmutex) +{ + uintptr_t prev_owner; + + prev_owner = rtmutex_tryacquire(rtmutex); + + if (prev_owner == 0) { + return; + } + + rtmutex_lock_slow(rtmutex); +} + +/* + * Unlock a real-time mutex. + * + * The mutex must be locked, and must have been locked by the calling + * thread. + */ +static inline void +rtmutex_unlock(struct rtmutex *rtmutex) +{ + uintptr_t prev_owner; + + prev_owner = rtmutex_tryrelease(rtmutex); + + if (!(prev_owner & RTMUTEX_CONTENDED)) { + return; + } + + rtmutex_unlock_slow(rtmutex); +} + +#endif /* _KERN_RTMUTEX_H */ diff --git a/kern/rtmutex_i.h b/kern/rtmutex_i.h new file mode 100644 index 00000000..b99b5bd0 --- /dev/null +++ b/kern/rtmutex_i.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _KERN_RTMUTEX_I_H +#define _KERN_RTMUTEX_I_H + +#include <stdbool.h> +#include <stdint.h> + +#include <kern/assert.h> +#include <kern/rtmutex_types.h> +#include <kern/thread.h> +#include <machine/atomic.h> + +/* + * Real-time mutex flags. + * + * The "contended" flag indicates that threads are waiting for the mutex + * to be unlocked. It forces threads trying to lock the mutex as well as + * the owner to take the slow path. + * + * The "force-wait" flag prevents "stealing" a mutex. When a contended + * mutex is unlocked, a thread may concurrently try to lock it. Without + * this flag, it may succeed, and in doing so, it would prevent a + * potentially higher priority thread from locking the mutex. 
The flag + * forces all threads to not only take the slow path, but to also call + * the turnstile wait function so that only the highest priority thread + * may lock the mutex. + */ +#define RTMUTEX_CONTENDED 0x1 +#define RTMUTEX_FORCE_WAIT 0x2 + +#define RTMUTEX_OWNER_MASK (~((uintptr_t)(RTMUTEX_FORCE_WAIT \ + | RTMUTEX_CONTENDED))) + +#define rtmutex_assert_owner_aligned(owner) \ + assert(((owner) & ~RTMUTEX_OWNER_MASK) == 0) + +static inline uintptr_t +rtmutex_tryacquire(struct rtmutex *rtmutex) +{ + uintptr_t owner; + + owner = (uintptr_t)thread_self(); + rtmutex_assert_owner_aligned(owner); + return atomic_cas_uintptr(&rtmutex->owner, 0, owner); +} + +static inline uintptr_t +rtmutex_tryrelease(struct rtmutex *rtmutex) +{ + uintptr_t owner, prev_owner; + + owner = (uintptr_t)thread_self(); + rtmutex_assert_owner_aligned(owner); + prev_owner = atomic_cas_uintptr(&rtmutex->owner, owner, 0); + assert((prev_owner & RTMUTEX_OWNER_MASK) == owner); + return prev_owner; +} + +void rtmutex_lock_slow(struct rtmutex *rtmutex); + +void rtmutex_unlock_slow(struct rtmutex *rtmutex); + +#endif /* _KERN_RTMUTEX_I_H */ diff --git a/kern/rtmutex_types.h b/kern/rtmutex_types.h new file mode 100644 index 00000000..c03e0484 --- /dev/null +++ b/kern/rtmutex_types.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Isolated type definition used to avoid inclusion circular dependencies. + */ + +#ifndef _KERN_RTMUTEX_TYPES_H +#define _KERN_RTMUTEX_TYPES_H + +#include <stdint.h> + +struct rtmutex { + uintptr_t owner; +}; + +#endif /* _KERN_RTMUTEX_TYPES_H */ diff --git a/kern/sleepq.c b/kern/sleepq.c new file mode 100644 index 00000000..0c41caef --- /dev/null +++ b/kern/sleepq.c @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * TODO Analyse hash parameters. 
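As an illustration of the owner word layout described above (owning thread pointer in the aligned bits, RTMUTEX_CONTENDED and RTMUTEX_FORCE_WAIT in the low bits), a small decoding helper; it is a debugging-oriented sketch, not part of the patch:

```c
#include <stddef.h>
#include <stdint.h>

#include <kern/rtmutex_i.h>
#include <kern/rtmutex_types.h>
#include <kern/thread.h>

/*
 * Return the owning thread of a real-time mutex, or NULL if the owner
 * bits are clear (unlocked, possibly with the contended/force-wait
 * flags still set during a hand-off). The snapshot is unsynchronized
 * and only meaningful for inspection.
 */
static struct thread *
rtmutex_owner_thread(const struct rtmutex *rtmutex)
{
    uintptr_t owner;

    owner = rtmutex->owner & RTMUTEX_OWNER_MASK;
    return (owner == 0) ? NULL : (struct thread *)owner;
}
```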
+ */ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include <kern/assert.h> +#include <kern/init.h> +#include <kern/kmem.h> +#include <kern/list.h> +#include <kern/macros.h> +#include <kern/param.h> +#include <kern/sleepq.h> +#include <kern/spinlock.h> +#include <kern/thread.h> + +struct sleepq_bucket { + struct spinlock lock; + struct list list; +} __aligned(CPU_L1_SIZE); + +struct sleepq { + struct sleepq_bucket *bucket; + struct list node; + const void *sync_obj; + struct list waiters; + struct sleepq *next_free; +}; + +struct sleepq_waiter { + struct list node; + struct thread *thread; +}; + +#define SLEEPQ_HTABLE_SIZE 128 +#define SLEEPQ_COND_HTABLE_SIZE 64 + +#if !ISP2(SLEEPQ_HTABLE_SIZE) || !ISP2(SLEEPQ_COND_HTABLE_SIZE) +#error hash table size must be a power of two +#endif /* !ISP2(SLEEPQ_HTABLE_SIZE) */ + +#define SLEEPQ_HTABLE_MASK (SLEEPQ_HTABLE_SIZE - 1) +#define SLEEPQ_COND_HTABLE_MASK (SLEEPQ_COND_HTABLE_SIZE - 1) + +static struct sleepq_bucket sleepq_htable[SLEEPQ_HTABLE_SIZE]; +static struct sleepq_bucket sleepq_cond_htable[SLEEPQ_COND_HTABLE_SIZE]; + +static struct kmem_cache sleepq_cache; + +static uintptr_t +sleepq_hash(const void *addr) +{ + return ((uintptr_t)addr >> 8) ^ (uintptr_t)addr; +} + +static void +sleepq_waiter_init(struct sleepq_waiter *waiter, struct thread *thread) +{ + waiter->thread = thread; +} + +static void +sleepq_waiter_wakeup(struct sleepq_waiter *waiter) +{ + thread_wakeup(waiter->thread); +} + +static void +sleepq_assert_init_state(const struct sleepq *sleepq) +{ + assert(sleepq->bucket == NULL); + assert(sleepq->sync_obj == NULL); + assert(list_empty(&sleepq->waiters)); + assert(sleepq->next_free == NULL); +} + +static void +sleepq_use(struct sleepq *sleepq, const void *sync_obj) +{ + assert(sleepq->sync_obj == NULL); + sleepq->sync_obj = sync_obj; +} + +static void +sleepq_unuse(struct sleepq *sleepq) +{ + assert(sleepq->sync_obj != NULL); + sleepq->sync_obj = NULL; +} + +static bool +sleepq_in_use(const struct sleepq *sleepq) +{ + return sleepq->sync_obj != NULL; +} + +static bool +sleepq_in_use_by(const struct sleepq *sleepq, const void *sync_obj) +{ + return sleepq->sync_obj == sync_obj; +} + +static void +sleepq_bucket_init(struct sleepq_bucket *bucket) +{ + spinlock_init(&bucket->lock); + list_init(&bucket->list); +} + +static struct sleepq_bucket * +sleepq_bucket_get_cond(const void *sync_obj) +{ + uintptr_t index; + + index = sleepq_hash(sync_obj) & SLEEPQ_COND_HTABLE_MASK; + assert(index < ARRAY_SIZE(sleepq_cond_htable)); + return &sleepq_cond_htable[index]; +} + +static struct sleepq_bucket * +sleepq_bucket_get(const void *sync_obj, bool condition) +{ + uintptr_t index; + + if (condition) { + return sleepq_bucket_get_cond(sync_obj); + } + + index = sleepq_hash(sync_obj) & SLEEPQ_HTABLE_MASK; + assert(index < ARRAY_SIZE(sleepq_htable)); + return &sleepq_htable[index]; +} + +static void +sleepq_bucket_add(struct sleepq_bucket *bucket, struct sleepq *sleepq) +{ + assert(sleepq->bucket == NULL); + sleepq->bucket = bucket; + list_insert_tail(&bucket->list, &sleepq->node); +} + +static void +sleepq_bucket_remove(struct sleepq_bucket *bucket, struct sleepq *sleepq) +{ + assert(sleepq->bucket == bucket); + sleepq->bucket = NULL; + list_remove(&sleepq->node); +} + +static struct sleepq * +sleepq_bucket_lookup(const struct sleepq_bucket *bucket, const void *sync_obj) +{ + struct sleepq *sleepq; + + list_for_each_entry(&bucket->list, sleepq, node) { + if (sleepq_in_use_by(sleepq, sync_obj)) { + assert(sleepq->bucket == 
bucket); + return sleepq; + } + } + + return NULL; +} + +static void +sleepq_ctor(void *ptr) +{ + struct sleepq *sleepq; + + sleepq = ptr; + sleepq->bucket = NULL; + sleepq->sync_obj = NULL; + list_init(&sleepq->waiters); + sleepq->next_free = NULL; +} + +void __init +sleepq_setup(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(sleepq_htable); i++) { + sleepq_bucket_init(&sleepq_htable[i]); + } + + for (i = 0; i < ARRAY_SIZE(sleepq_cond_htable); i++) { + sleepq_bucket_init(&sleepq_cond_htable[i]); + } + + kmem_cache_init(&sleepq_cache, "sleepq", sizeof(struct sleepq), + CPU_L1_SIZE, sleepq_ctor, 0); +} + +struct sleepq * +sleepq_create(void) +{ + struct sleepq *sleepq; + + sleepq = kmem_cache_alloc(&sleepq_cache); + + if (sleepq == NULL) { + return NULL; + } + + sleepq_assert_init_state(sleepq); + return sleepq; +} + +void +sleepq_destroy(struct sleepq *sleepq) +{ + sleepq_assert_init_state(sleepq); + kmem_cache_free(&sleepq_cache, sleepq); +} + +struct sleepq * +sleepq_acquire(const void *sync_obj, bool condition) +{ + struct sleepq_bucket *bucket; + struct sleepq *sleepq; + + assert(sync_obj != NULL); + + bucket = sleepq_bucket_get(sync_obj, condition); + + spinlock_lock(&bucket->lock); + + sleepq = sleepq_bucket_lookup(bucket, sync_obj); + + if (sleepq == NULL) { + spinlock_unlock(&bucket->lock); + return NULL; + } + + return sleepq; +} + +void +sleepq_release(struct sleepq *sleepq) +{ + spinlock_unlock(&sleepq->bucket->lock); +} + +static void +sleepq_push_free(struct sleepq *sleepq, struct sleepq *free_sleepq) +{ + assert(free_sleepq->next_free == NULL); + free_sleepq->next_free = sleepq->next_free; + sleepq->next_free = free_sleepq; +} + +static struct sleepq * +sleepq_pop_free(struct sleepq *sleepq) +{ + struct sleepq *free_sleepq; + + free_sleepq = sleepq->next_free; + + if (free_sleepq == NULL) { + return NULL; + } + + sleepq->next_free = free_sleepq->next_free; + free_sleepq->next_free = NULL; + return free_sleepq; +} + +struct sleepq * +sleepq_lend(const void *sync_obj, bool condition) +{ + struct sleepq_bucket *bucket; + struct sleepq *sleepq, *prev; + + assert(sync_obj != NULL); + + sleepq = thread_sleepq_lend(); + sleepq_assert_init_state(sleepq); + + bucket = sleepq_bucket_get(sync_obj, condition); + + spinlock_lock(&bucket->lock); + + prev = sleepq_bucket_lookup(bucket, sync_obj); + + if (prev == NULL) { + sleepq_use(sleepq, sync_obj); + sleepq_bucket_add(bucket, sleepq); + } else { + sleepq_push_free(prev, sleepq); + sleepq = prev; + } + + return sleepq; +} + +void +sleepq_return(struct sleepq *sleepq) +{ + struct sleepq_bucket *bucket; + struct sleepq *free_sleepq; + + assert(sleepq_in_use(sleepq)); + + bucket = sleepq->bucket; + free_sleepq = sleepq_pop_free(sleepq); + + if (free_sleepq == NULL) { + sleepq_bucket_remove(bucket, sleepq); + sleepq_unuse(sleepq); + free_sleepq = sleepq; + } + + spinlock_unlock(&bucket->lock); + + sleepq_assert_init_state(free_sleepq); + thread_sleepq_return(free_sleepq); +} + +static void +sleepq_add_waiter(struct sleepq *sleepq, struct sleepq_waiter *waiter) +{ + list_insert_head(&sleepq->waiters, &waiter->node); +} + +static void +sleepq_remove_waiter(struct sleepq *sleepq, struct sleepq_waiter *waiter) +{ + (void)sleepq; + list_remove(&waiter->node); +} + +bool +sleepq_empty(const struct sleepq *sleepq) +{ + return list_empty(&sleepq->waiters); +} + +void +sleepq_wait(struct sleepq *sleepq, const char *wchan) +{ + struct sleepq_waiter waiter; + struct thread *thread; + + thread = thread_self(); + sleepq_waiter_init(&waiter, 
thread); + sleepq_add_waiter(sleepq, &waiter); + + thread_sleep(&sleepq->bucket->lock, sleepq->sync_obj, wchan); + + sleepq_remove_waiter(sleepq, &waiter); +} + +void +sleepq_signal(struct sleepq *sleepq) +{ + struct sleepq_waiter *waiter; + + if (sleepq_empty(sleepq)) { + return; + } + + waiter = list_last_entry(&sleepq->waiters, struct sleepq_waiter, node); + sleepq_waiter_wakeup(waiter); +} diff --git a/kern/sleepq.h b/kern/sleepq.h new file mode 100644 index 00000000..1ba4a319 --- /dev/null +++ b/kern/sleepq.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Generic sleep queues. + * + * Sleep queues are used to build sleeping synchronization primitives + * such as mutexes and condition variables. + * + * Although the sleep queues are mostly generic, this implementation + * relies on knowing whether a synchronization object is a condition + * variable or not, because waiting on a condition variable unlocks + * the associated mutex, at which point two sleep queues are locked. + * Handling condition variable sleep queues slightly differently + * allows preventing deadlocks while keeping overall complexity low. + * + * In addition, despite being used to implement condition variables, + * this implementation doesn't provide a broadcast call. The rationale + * is to force users to implement "chained waking" in order to avoid + * the thundering herd effect. + */ + +#ifndef _KERN_SLEEPQ_H +#define _KERN_SLEEPQ_H + +#include <stdbool.h> + +struct sleepq; + +/* + * Initialize the sleepq module. + */ +void sleepq_setup(void); + +/* + * Create/destroy a sleep queue. + */ +struct sleepq * sleepq_create(void); +void sleepq_destroy(struct sleepq *sleepq); + +/* + * Acquire/release a sleep queue. + * + * Acquiring a sleep queue serializes all access and disables preemption. + * + * The condition argument must be true if the synchronization object + * is a condition variable. + */ +struct sleepq * sleepq_acquire(const void *sync_obj, bool condition); +void sleepq_release(struct sleepq *sleepq); + +/* + * Lend/return a sleep queue. + * + * A thread lends its private sleep queue to the sleepq module in + * order to prepare its sleep. The sleep queue obtained on lending + * is either the thread's queue, or an already existing queue for + * this synchronization object if another thread is waiting. + * + * When multiple threads are waiting on the same queue, the extra + * queues lent are kept in an internal free list, used when threads + * are awaken to return a queue to them. + * + * Note that the sleep queue returned may not be the one lent. + * + * The sleep queue obtained when lending is automatically acquired. + * + * The condition argument must be true if the synchronization object + * is a condition variable. 
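A minimal sketch of the lend/return and acquire/release protocol just described, loosely following kern/mutex.c: the waiter lends its queue around the sleep, the waker signals through an acquired queue. The event structure and its flag handling are illustrative, not part of the patch:

```c
#include <stdbool.h>
#include <stddef.h>

#include <kern/sleepq.h>
#include <machine/atomic.h>

struct event {
    unsigned int signaled;
};

static void
event_init(struct event *event)
{
    event->signaled = 0;
}

static void
event_wait(struct event *event)
{
    struct sleepq *sleepq;

    /* Lending acquires the queue, serializing against event_signal(). */
    sleepq = sleepq_lend(event, false);

    while (atomic_swap_uint(&event->signaled, 0) == 0) {
        sleepq_wait(sleepq, "event");
    }

    sleepq_return(sleepq);
}

static void
event_signal(struct event *event)
{
    struct sleepq *sleepq;

    atomic_swap_uint(&event->signaled, 1);

    sleepq = sleepq_acquire(event, false);

    if (sleepq == NULL) {
        return;         /* no thread is currently waiting */
    }

    sleepq_signal(sleepq);
    sleepq_release(sleepq);
}
```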
+ */ +struct sleepq * sleepq_lend(const void *sync_obj, bool condition); +void sleepq_return(struct sleepq *sleepq); + +/* + * Return true if the given sleep queue has no waiters. + * + * The sleep queue must be acquired when calling this function. + */ +bool sleepq_empty(const struct sleepq *sleepq); + +/* + * Wait for a wake up on the given sleep queue. + * + * The sleep queue must be lent when calling this function. It is + * released and later reacquired before returning from this function. + * + * The calling thread is considered a waiter as long as it didn't + * reacquire the sleep queue. This means that signalling a sleep queue + * has no visible effect on the number of waiters until the queue is + * released, e.g. if a single thread is waiting and another signals + * the queue, the queue is not immediately considered empty. + * + * Threads are queued in FIFO order. + */ +void sleepq_wait(struct sleepq *sleepq, const char *wchan); + +/* + * Wake up a thread waiting on the given sleep queue, if any. + * + * The sleep queue must be acquired when calling this function. + * + * Since a sleep queue must be lent (and in turn is automatically + * acquired) when waiting, and acquired in order to signal it, + * wake-ups are serialized and cannot be missed. + * + * Threads are queued in FIFO order, which means signalling a sleep + * queue multiple times always awakens the same thread, regardless + * of new waiters, as long as that first thread didn't reacquire the + * sleep queue. + */ +void sleepq_signal(struct sleepq *sleepq); + +#endif /* _KERN_SLEEPQ_H */ diff --git a/kern/task.c b/kern/task.c index afc2407a..c7caeb48 100644 --- a/kern/task.c +++ b/kern/task.c @@ -143,15 +143,22 @@ task_info(struct task *task) printk("task: name: %s, threads:\n", task->name); + /* + * Don't grab any lock when accessing threads, so that the function + * can be used to debug in the middle of most critical sections. + * Threads are only destroyed after being removed from their task + * so holding the task lock is enough to guarantee existence. + */ list_for_each_entry(&task->threads, thread, task_node) { - printk("task: " TASK_INFO_ADDR_FMT " %c %8s:" TASK_INFO_ADDR_FMT - " %.2s:%02hu %s\n", + printk(TASK_INFO_ADDR_FMT " %c %8s:" TASK_INFO_ADDR_FMT + " %.2s:%02hu %02u %s\n", (unsigned long)thread, thread_state_to_chr(thread), thread_wchan_desc(thread), (unsigned long)thread_wchan_addr(thread), - thread_sched_class_to_str(thread_sched_class(thread)), - thread_priority(thread), + thread_sched_class_to_str(thread_user_sched_class(thread)), + thread_user_priority(thread), + thread_real_global_priority(thread), thread->name); } diff --git a/kern/thread.c b/kern/thread.c index 8d7b3a75..14070d92 100644 --- a/kern/thread.c +++ b/kern/thread.c @@ -81,6 +81,7 @@ * weights in a smoother way than a raw scaling). */ +#include <stdbool.h> #include <stddef.h> #include <string.h> @@ -98,11 +99,13 @@ #include <kern/panic.h> #include <kern/param.h> #include <kern/percpu.h> +#include <kern/sleepq.h> #include <kern/spinlock.h> #include <kern/sprintf.h> #include <kern/sref.h> #include <kern/task.h> #include <kern/thread.h> +#include <kern/turnstile.h> #include <kern/work.h> #include <machine/atomic.h> #include <machine/cpu.h> @@ -259,13 +262,13 @@ struct thread_runq { * Operations of a scheduling class. 
*/ struct thread_sched_ops { - void (*init_sched)(struct thread *thread, unsigned short priority); struct thread_runq * (*select_runq)(struct thread *thread); void (*add)(struct thread_runq *runq, struct thread *thread); void (*remove)(struct thread_runq *runq, struct thread *thread); void (*put_prev)(struct thread_runq *runq, struct thread *thread); struct thread * (*get_next)(struct thread_runq *runq); - void (*set_priority)(struct thread *thread, unsigned short priority); + void (*reset_priority)(struct thread *thread, unsigned short priority); + void (*update_priority)(struct thread *thread, unsigned short priority); unsigned int (*get_global_priority)(unsigned short priority); void (*set_next)(struct thread_runq *runq, struct thread *thread); void (*tick)(struct thread_runq *runq, struct thread *thread); @@ -341,6 +344,13 @@ struct thread_zombie { struct thread *thread; }; +static unsigned char +thread_policy_to_class(unsigned char policy) +{ + assert(policy < ARRAY_SIZE(thread_policy_table)); + return thread_policy_table[policy]; +} + static void thread_set_wchan(struct thread *thread, const void *wchan_addr, const char *wchan_desc) @@ -359,15 +369,24 @@ thread_clear_wchan(struct thread *thread) } static const struct thread_sched_ops * -thread_get_sched_ops(const struct thread *thread) +thread_get_sched_ops(unsigned char sched_class) { - unsigned char sched_class; - - sched_class = thread_sched_class(thread); assert(sched_class < ARRAY_SIZE(thread_sched_ops)); return &thread_sched_ops[sched_class]; } +static const struct thread_sched_ops * +thread_get_user_sched_ops(const struct thread *thread) +{ + return thread_get_sched_ops(thread_user_sched_class(thread)); +} + +static const struct thread_sched_ops * +thread_get_real_sched_ops(const struct thread *thread) +{ + return thread_get_sched_ops(thread_real_sched_class(thread)); +} + static void __init thread_runq_init_rt(struct thread_runq *runq) { @@ -457,8 +476,9 @@ thread_runq_add(struct thread_runq *runq, struct thread *thread) assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); + assert(!thread->in_runq); - ops = thread_get_sched_ops(thread); + ops = thread_get_real_sched_ops(thread); ops->add(runq, thread); if (runq->nr_threads == 0) { @@ -467,11 +487,13 @@ thread_runq_add(struct thread_runq *runq, struct thread *thread) runq->nr_threads++; - if (thread_sched_class(thread) < thread_sched_class(runq->current)) { + if (thread_real_sched_class(thread) + < thread_real_sched_class(runq->current)) { thread_set_flag(runq->current, THREAD_YIELD); } thread->runq = runq; + thread->in_runq = true; } static void @@ -481,6 +503,7 @@ thread_runq_remove(struct thread_runq *runq, struct thread *thread) assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); + assert(thread->in_runq); runq->nr_threads--; @@ -488,8 +511,10 @@ thread_runq_remove(struct thread_runq *runq, struct thread *thread) cpumap_set_atomic(&thread_idle_runqs, thread_runq_cpu(runq)); } - ops = thread_get_sched_ops(thread); + ops = thread_get_real_sched_ops(thread); ops->remove(runq, thread); + + thread->in_runq = false; } static void @@ -500,7 +525,7 @@ thread_runq_put_prev(struct thread_runq *runq, struct thread *thread) assert(!cpu_intr_enabled()); spinlock_assert_locked(&runq->lock); - ops = thread_get_sched_ops(thread); + ops = thread_get_real_sched_ops(thread); if (ops->put_prev != NULL) { ops->put_prev(runq, thread); @@ -534,7 +559,7 @@ thread_runq_set_next(struct thread_runq *runq, struct thread *thread) { const struct thread_sched_ops *ops; - ops 
= thread_get_sched_ops(thread); + ops = thread_get_real_sched_ops(thread); if (ops->set_next != NULL) { ops->set_next(runq, thread); @@ -656,13 +681,6 @@ thread_runq_double_lock(struct thread_runq *a, struct thread_runq *b) } } -static void -thread_sched_rt_init_sched(struct thread *thread, unsigned short priority) -{ - assert(priority <= THREAD_SCHED_RT_PRIO_MAX); - thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE; -} - static struct thread_runq * thread_sched_rt_select_runq(struct thread *thread) { @@ -689,15 +707,17 @@ thread_sched_rt_add(struct thread_runq *runq, struct thread *thread) struct list *threads; rt_runq = &runq->rt_runq; - threads = &rt_runq->threads[thread_priority(thread)]; + threads = &rt_runq->threads[thread_real_priority(thread)]; list_insert_tail(threads, &thread->rt_data.node); if (list_singular(threads)) { - rt_runq->bitmap |= (1ULL << thread_priority(thread)); + rt_runq->bitmap |= (1ULL << thread_real_priority(thread)); } - if ((thread_sched_class(thread) == thread_sched_class(runq->current)) - && (thread_priority(thread) > thread_priority(runq->current))) { + if ((thread_real_sched_class(thread) + == thread_real_sched_class(runq->current)) + && (thread_real_priority(thread) + > thread_real_priority(runq->current))) { thread_set_flag(runq->current, THREAD_YIELD); } } @@ -709,11 +729,11 @@ thread_sched_rt_remove(struct thread_runq *runq, struct thread *thread) struct list *threads; rt_runq = &runq->rt_runq; - threads = &rt_runq->threads[thread_priority(thread)]; + threads = &rt_runq->threads[thread_real_priority(thread)]; list_remove(&thread->rt_data.node); if (list_empty(threads)) { - rt_runq->bitmap &= ~(1ULL << thread_priority(thread)); + rt_runq->bitmap &= ~(1ULL << thread_real_priority(thread)); } } @@ -745,6 +765,13 @@ thread_sched_rt_get_next(struct thread_runq *runq) return thread; } +static void +thread_sched_rt_reset_priority(struct thread *thread, unsigned short priority) +{ + assert(priority <= THREAD_SCHED_RT_PRIO_MAX); + thread->rt_data.time_slice = THREAD_DEFAULT_RR_TIME_SLICE; +} + static unsigned int thread_sched_rt_get_global_priority(unsigned short priority) { @@ -762,7 +789,7 @@ thread_sched_rt_tick(struct thread_runq *runq, struct thread *thread) { (void)runq; - if (thread_sched_policy(thread) != THREAD_SCHED_POLICY_RR) { + if (thread_real_sched_policy(thread) != THREAD_SCHED_POLICY_RR) { return; } @@ -782,16 +809,6 @@ thread_sched_fs_prio2weight(unsigned short priority) return ((priority + 1) * THREAD_FS_ROUND_SLICE_BASE); } -static void -thread_sched_fs_init_sched(struct thread *thread, unsigned short priority) -{ - assert(priority <= THREAD_SCHED_FS_PRIO_MAX); - thread->fs_data.fs_runq = NULL; - thread->fs_data.round = 0; - thread->fs_data.weight = thread_sched_fs_prio2weight(priority); - thread->fs_data.work = 0; -} - static struct thread_runq * thread_sched_fs_select_runq(struct thread *thread) { @@ -893,7 +910,7 @@ thread_sched_fs_enqueue(struct thread_fs_runq *fs_runq, unsigned long round, assert(thread->fs_data.fs_runq == NULL); assert(thread->fs_data.work <= thread->fs_data.weight); - group = &fs_runq->group_array[thread_priority(thread)]; + group = &fs_runq->group_array[thread_real_priority(thread)]; group_weight = group->weight + thread->fs_data.weight; total_weight = fs_runq->weight + thread->fs_data.weight; node = (group->weight == 0) @@ -969,7 +986,7 @@ thread_sched_fs_restart(struct thread_runq *runq) assert(node != NULL); fs_runq->current = list_entry(node, struct thread_fs_group, node); - if 
(thread_sched_class(runq->current) == THREAD_SCHED_CLASS_FS) { + if (thread_real_sched_class(runq->current) == THREAD_SCHED_CLASS_FS) { thread_set_flag(runq->current, THREAD_YIELD); } } @@ -1005,7 +1022,7 @@ thread_sched_fs_dequeue(struct thread *thread) assert(thread->fs_data.fs_runq != NULL); fs_runq = thread->fs_data.fs_runq; - group = &fs_runq->group_array[thread_priority(thread)]; + group = &fs_runq->group_array[thread_real_priority(thread)]; thread->fs_data.fs_runq = NULL; list_remove(&thread->fs_data.runq_node); @@ -1080,7 +1097,7 @@ thread_sched_fs_put_prev(struct thread_runq *runq, struct thread *thread) struct thread_fs_group *group; fs_runq = runq->fs_runq_active; - group = &fs_runq->group_array[thread_priority(thread)]; + group = &fs_runq->group_array[thread_real_priority(thread)]; list_insert_tail(&group->threads, &thread->fs_data.group_node); if (thread->fs_data.work >= thread->fs_data.weight) { @@ -1148,8 +1165,19 @@ thread_sched_fs_get_next(struct thread_runq *runq) } static void -thread_sched_fs_set_priority(struct thread *thread, unsigned short priority) +thread_sched_fs_reset_priority(struct thread *thread, unsigned short priority) { + assert(priority <= THREAD_SCHED_FS_PRIO_MAX); + thread->fs_data.fs_runq = NULL; + thread->fs_data.round = 0; + thread->fs_data.weight = thread_sched_fs_prio2weight(priority); + thread->fs_data.work = 0; +} + +static void +thread_sched_fs_update_priority(struct thread *thread, unsigned short priority) +{ + assert(priority <= THREAD_SCHED_FS_PRIO_MAX); thread->fs_data.weight = thread_sched_fs_prio2weight(priority); if (thread->fs_data.work >= thread->fs_data.weight) { @@ -1180,7 +1208,7 @@ thread_sched_fs_tick(struct thread_runq *runq, struct thread *thread) fs_runq = runq->fs_runq_active; fs_runq->work++; - group = &fs_runq->group_array[thread_priority(thread)]; + group = &fs_runq->group_array[thread_real_priority(thread)]; group->work++; thread_set_flag(thread, THREAD_YIELD); thread->fs_data.work++; @@ -1231,7 +1259,8 @@ thread_sched_fs_balance_eligible(struct thread_runq *runq, if ((nr_threads == 0) || ((nr_threads == 1) - && (thread_sched_class(runq->current) == THREAD_SCHED_CLASS_FS))) { + && (thread_real_sched_class(runq->current) + == THREAD_SCHED_CLASS_FS))) { return 0; } @@ -1521,39 +1550,40 @@ thread_sched_idle_get_global_priority(unsigned short priority) return THREAD_SCHED_GLOBAL_PRIO_IDLE; } -static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES] = { +static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES] + = { [THREAD_SCHED_CLASS_RT] = { - .init_sched = thread_sched_rt_init_sched, .select_runq = thread_sched_rt_select_runq, .add = thread_sched_rt_add, .remove = thread_sched_rt_remove, .put_prev = thread_sched_rt_put_prev, .get_next = thread_sched_rt_get_next, - .set_priority = NULL, + .reset_priority = thread_sched_rt_reset_priority, + .update_priority = NULL, .get_global_priority = thread_sched_rt_get_global_priority, .set_next = thread_sched_rt_set_next, .tick = thread_sched_rt_tick, }, [THREAD_SCHED_CLASS_FS] = { - .init_sched = thread_sched_fs_init_sched, .select_runq = thread_sched_fs_select_runq, .add = thread_sched_fs_add, .remove = thread_sched_fs_remove, .put_prev = thread_sched_fs_put_prev, .get_next = thread_sched_fs_get_next, - .set_priority = thread_sched_fs_set_priority, + .reset_priority = thread_sched_fs_reset_priority, + .update_priority = thread_sched_fs_update_priority, .get_global_priority = thread_sched_fs_get_global_priority, .set_next = 
thread_sched_fs_set_next, .tick = thread_sched_fs_tick, }, [THREAD_SCHED_CLASS_IDLE] = { - .init_sched = NULL, .select_runq = thread_sched_idle_select_runq, .add = thread_sched_idle_add, .remove = thread_sched_idle_remove, .put_prev = NULL, .get_next = thread_sched_idle_get_next, - .set_priority = NULL, + .reset_priority = NULL, + .update_priority = NULL, .get_global_priority = thread_sched_idle_get_global_priority, .set_next = NULL, .tick = NULL, @@ -1561,30 +1591,95 @@ static const struct thread_sched_ops thread_sched_ops[THREAD_NR_SCHED_CLASSES] = }; static void -thread_set_sched_policy(struct thread *thread, unsigned char sched_policy) +thread_set_user_sched_policy(struct thread *thread, unsigned char sched_policy) +{ + thread->user_sched_data.sched_policy = sched_policy; +} + +static void +thread_set_user_sched_class(struct thread *thread, unsigned char sched_class) +{ + thread->user_sched_data.sched_class = sched_class; +} + +static void +thread_set_user_priority(struct thread *thread, unsigned short priority) +{ + const struct thread_sched_ops *ops; + + ops = thread_get_user_sched_ops(thread); + + thread->user_sched_data.priority = priority; + thread->user_sched_data.global_priority + = ops->get_global_priority(priority); +} + +static void +thread_update_user_priority(struct thread *thread, unsigned short priority) +{ + thread_set_user_priority(thread, priority); +} + +static void +thread_set_real_sched_policy(struct thread *thread, unsigned char sched_policy) +{ + thread->real_sched_data.sched_policy = sched_policy; +} + +static void +thread_set_real_sched_class(struct thread *thread, unsigned char sched_class) { - thread->sched_data.sched_policy = sched_policy; + thread->real_sched_data.sched_class = sched_class; } static void -thread_set_sched_class(struct thread *thread, unsigned char sched_class) +thread_set_real_priority(struct thread *thread, unsigned short priority) { - thread->sched_data.sched_class = sched_class; + const struct thread_sched_ops *ops; + + ops = thread_get_real_sched_ops(thread); + + thread->real_sched_data.priority = priority; + thread->real_sched_data.global_priority + = ops->get_global_priority(priority); + + if (ops->reset_priority != NULL) { + ops->reset_priority(thread, priority); + } } static void -thread_set_priority(struct thread *thread, unsigned short priority) +thread_update_real_priority(struct thread *thread, unsigned short priority) { const struct thread_sched_ops *ops; - ops = thread_get_sched_ops(thread); + ops = thread_get_real_sched_ops(thread); + + thread->real_sched_data.priority = priority; + thread->real_sched_data.global_priority + = ops->get_global_priority(priority); - if (ops->set_priority != NULL) { - ops->set_priority(thread, priority); + if (ops->update_priority != NULL) { + ops->update_priority(thread, priority); } +} - thread->sched_data.priority = priority; - thread->sched_data.global_priority = ops->get_global_priority(priority); +static void +thread_reset_real_priority(struct thread *thread) +{ + const struct thread_sched_ops *ops; + struct thread_sched_data *user, *real; + + user = &thread->user_sched_data; + real = &thread->real_sched_data; + *real = *user; + thread->boosted = false; + + ops = thread_get_user_sched_ops(thread); + + if (ops->reset_priority != NULL) { + ops->reset_priority(thread, real->priority); + } } static void __init @@ -1600,9 +1695,10 @@ thread_bootstrap_common(unsigned int cpu) booter->flags = 0; booter->preempt = 1; cpumap_fill(&booter->cpumap); - thread_set_sched_policy(booter, 
THREAD_SCHED_POLICY_IDLE); - thread_set_sched_class(booter, THREAD_SCHED_CLASS_IDLE); - thread_set_priority(booter, 0); + thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE); + thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE); + thread_set_user_priority(booter, 0); + thread_reset_real_priority(booter); memset(booter->tsd, 0, sizeof(booter->tsd)); booter->task = kernel_task; thread_runq_init(percpu_ptr(thread_runq, cpu), cpu, booter); @@ -1616,8 +1712,8 @@ thread_bootstrap(void) thread_fs_highest_round = THREAD_FS_INITIAL_ROUND; - thread_bootstrap_common(0); tcb_set_current(&thread_booters[0].tcb); + thread_bootstrap_common(0); } void __init @@ -1673,23 +1769,9 @@ thread_destroy_tsd(struct thread *thread) } } -static void -thread_init_sched(struct thread *thread, unsigned short priority) -{ - const struct thread_sched_ops *ops; - - ops = thread_get_sched_ops(thread); - - if (ops->init_sched != NULL) { - ops->init_sched(thread, priority); - } - - thread->sched_data.priority = priority; - thread->sched_data.global_priority = ops->get_global_priority(priority); -} - static int -thread_init(struct thread *thread, void *stack, const struct thread_attr *attr, +thread_init(struct thread *thread, void *stack, + const struct thread_attr *attr, void (*fn)(void *), void *arg) { struct thread *caller; @@ -1706,15 +1788,34 @@ thread_init(struct thread *thread, void *stack, const struct thread_attr *attr, thread->nr_refs = 1; thread->flags = 0; thread->runq = NULL; + thread->in_runq = false; thread_set_wchan(thread, thread, "init"); thread->state = THREAD_SLEEPING; + thread->priv_sleepq = sleepq_create(); + + if (thread->priv_sleepq == NULL) { + error = ERROR_NOMEM; + goto error_sleepq; + } + + thread->priv_turnstile = turnstile_create(); + + if (thread->priv_turnstile == NULL) { + error = ERROR_NOMEM; + goto error_turnstile; + } + + turnstile_td_init(&thread->turnstile_td); + thread->last_cond = NULL; + thread->propagate_priority = false; thread->preempt = THREAD_SUSPEND_PREEMPT_LEVEL; thread->pinned = 0; thread->llsync_read = 0; cpumap_copy(&thread->cpumap, cpumap); - thread_set_sched_policy(thread, attr->policy); - thread_set_sched_class(thread, thread_policy_table[attr->policy]); - thread_init_sched(thread, attr->priority); + thread_set_user_sched_policy(thread, attr->policy); + thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy)); + thread_set_user_priority(thread, attr->priority); + thread_reset_real_priority(thread); memset(thread->tsd, 0, sizeof(thread->tsd)); mutex_init(&thread->join_lock); condition_init(&thread->join_cond); @@ -1732,15 +1833,19 @@ thread_init(struct thread *thread, void *stack, const struct thread_attr *attr, error = tcb_init(&thread->tcb, stack, thread_main); if (error) { - goto error_tsd; + goto error_tcb; } task_add_thread(task, thread); return 0; -error_tsd: +error_tcb: thread_destroy_tsd(thread); + turnstile_destroy(thread->priv_turnstile); +error_turnstile: + sleepq_destroy(thread->priv_sleepq); +error_sleepq: return error; } @@ -1782,8 +1887,12 @@ thread_destroy(struct thread *thread) thread_unlock_runq(runq, flags); } while (state != THREAD_DEAD); - thread_destroy_tsd(thread); + /* See task_info() */ task_remove_thread(thread->task, thread); + + thread_destroy_tsd(thread); + turnstile_destroy(thread->priv_turnstile); + sleepq_destroy(thread->priv_sleepq); kmem_cache_free(&thread_stack_cache, thread->stack); kmem_cache_free(&thread_cache, thread); } @@ -2208,7 +2317,7 @@ thread_wakeup(struct thread *thread) thread->state = 
THREAD_RUNNING; } else { /* - * If another wakeup was attempted right before this one, the thread + * If another wake-up was attempted right before this one, the thread * may currently be pushed on a remote run queue, and the run queue * being locked here is actually the previous one. The run queue * pointer may be modified concurrently, now being protected by the @@ -2233,7 +2342,7 @@ thread_wakeup(struct thread *thread) cpu_intr_save(&flags); if (!thread->pinned) { - runq = thread_sched_ops[thread_sched_class(thread)].select_runq(thread); + runq = thread_get_real_sched_ops(thread)->select_runq(thread); } else { runq = thread->runq; spinlock_lock(&runq->lock); @@ -2325,7 +2434,7 @@ thread_tick_intr(void) thread_balance_idle_tick(runq); } - ops = thread_get_sched_ops(thread); + ops = thread_get_real_sched_ops(thread); if (ops->tick != NULL) { ops->tick(runq, thread); @@ -2334,6 +2443,7 @@ thread_tick_intr(void) spinlock_unlock(&runq->lock); } +/* TODO Move outside */ char thread_state_to_chr(const struct thread *thread) { @@ -2349,6 +2459,7 @@ thread_state_to_chr(const struct thread *thread) } } +/* TODO Move outside */ const char * thread_sched_class_to_str(unsigned char sched_class) { @@ -2369,17 +2480,23 @@ thread_setscheduler(struct thread *thread, unsigned char policy, unsigned short priority) { struct thread_runq *runq; + struct turnstile_td *td; unsigned long flags; - bool current; + bool requeue, current, update; + + td = thread_turnstile_td(thread); + turnstile_td_lock(td); runq = thread_lock_runq(thread, &flags); - if ((thread_sched_policy(thread) == policy) - && (thread_priority(thread) == priority)) { + if ((thread_user_sched_policy(thread) == policy) + && (thread_user_priority(thread) == priority)) { goto out; } - if (thread->state != THREAD_RUNNING) { + requeue = thread->in_runq; + + if (!requeue) { current = false; } else { if (thread != runq->current) { @@ -2392,16 +2509,32 @@ thread_setscheduler(struct thread *thread, unsigned char policy, thread_runq_remove(runq, thread); } - if (thread_sched_policy(thread) == policy) { - thread_set_priority(thread, priority); + if (thread_user_sched_policy(thread) == policy) { + thread_update_user_priority(thread, priority); + update = true; } else { - thread_set_sched_policy(thread, policy); - assert(policy < ARRAY_SIZE(thread_policy_table)); - thread_set_sched_class(thread, thread_policy_table[policy]); - thread_init_sched(thread, priority); + thread_set_user_sched_policy(thread, policy); + thread_set_user_sched_class(thread, thread_policy_to_class(policy)); + thread_set_user_priority(thread, priority); + update = false; } - if (thread->state == THREAD_RUNNING) { + if (thread->boosted) { + if (thread_user_global_priority(thread) + >= thread_real_global_priority(thread)) { + thread_reset_real_priority(thread); + } + } else { + if (update) { + thread_update_real_priority(thread, priority); + } else { + thread_set_real_sched_policy(thread, policy); + thread_set_real_sched_class(thread, thread_policy_to_class(policy)); + thread_set_real_priority(thread, priority); + } + } + + if (requeue) { thread_runq_add(runq, thread); if (current) { @@ -2411,6 +2544,96 @@ thread_setscheduler(struct thread *thread, unsigned char policy, out: thread_unlock_runq(runq, flags); + turnstile_td_unlock(td); + + turnstile_td_propagate_priority(td); +} + +void +thread_pi_setscheduler(struct thread *thread, unsigned char policy, + unsigned short priority) +{ + const struct thread_sched_ops *ops; + struct thread_runq *runq; + struct turnstile_td *td; + unsigned int 
global_priority; + unsigned long flags; + bool requeue, current; + + td = thread_turnstile_td(thread); + turnstile_td_assert_lock(td); + + ops = thread_get_sched_ops(thread_policy_to_class(policy)); + global_priority = ops->get_global_priority(priority); + + runq = thread_lock_runq(thread, &flags); + + if ((thread_real_sched_policy(thread) == policy) + && (thread_real_priority(thread) == priority)) { + goto out; + } + + requeue = thread->in_runq; + + if (!requeue) { + current = false; + } else { + if (thread != runq->current) { + current = false; + } else { + thread_runq_put_prev(runq, thread); + current = true; + } + + thread_runq_remove(runq, thread); + } + + if (global_priority <= thread_user_global_priority(thread)) { + thread_reset_real_priority(thread); + } else { + if (thread_real_sched_policy(thread) == policy) { + thread_update_real_priority(thread, priority); + } else { + thread_set_real_sched_policy(thread, policy); + thread_set_real_sched_class(thread, thread_policy_to_class(policy)); + thread_set_real_priority(thread, priority); + } + + thread->boosted = true; + } + + if (requeue) { + thread_runq_add(runq, thread); + + if (current) { + thread_runq_set_next(runq, thread); + } + } + +out: + thread_unlock_runq(runq, flags); +} + +void +thread_propagate_priority(void) +{ + struct thread *thread; + + /* + * Although it's possible to propagate priority with preemption + * disabled, the operation can be too expensive to allow it. + */ + if (!thread_preempt_enabled()) { + thread_set_priority_propagation_needed(); + return; + } + + thread = thread_self(); + + /* Clear before propagation to avoid infinite recursion */ + thread->propagate_priority = false; + + turnstile_td_propagate_priority(thread_turnstile_td(thread)); } void diff --git a/kern/thread.h b/kern/thread.h index 21be2ab7..60c330b9 100644 --- a/kern/thread.h +++ b/kern/thread.h @@ -33,30 +33,24 @@ #ifndef _KERN_THREAD_H #define _KERN_THREAD_H +#include <stdbool.h> +#include <stddef.h> + #include <kern/assert.h> +#include <kern/condition.h> #include <kern/cpumap.h> #include <kern/macros.h> +#include <kern/spinlock_types.h> +#include <kern/turnstile_types.h> #include <machine/atomic.h> #include <machine/tcb.h> /* - * Forward declaration - */ -struct spinlock; - -/* * Thread structure. */ struct thread; /* - * Thread name buffer size. - */ -#define THREAD_NAME_SIZE 32 - -/* - * Common scheduling data. - * * The global priority of a thread is meant to be compared against * another global priority to determine which thread has higher priority. */ @@ -67,6 +61,11 @@ struct thread_sched_data { unsigned int global_priority; }; +/* + * Thread name buffer size. + */ +#define THREAD_NAME_SIZE 32 + #include <kern/thread_i.h> #define THREAD_KERNEL_PREFIX PACKAGE "_" @@ -199,8 +198,9 @@ void thread_join(struct thread *thread); * Make the current thread sleep while waiting for an event. * * The interlock is used to synchronize the thread state with respect to - * wakeups, i.e. a wakeup request sent by another thread will not be missed + * wake-ups, i.e. a wake-up request sent by another thread cannot be missed * if that thread is holding the interlock. + * * As a special exception, threads that use preemption as a synchronization * mechanism can ommit the interlock and pass a NULL pointer instead. 
* In any case, the preemption nesting level must strictly be one when calling @@ -210,9 +210,6 @@ void thread_join(struct thread *thread); * address should refer to a relevant synchronization object, normally * containing the interlock, but not necessarily. * - * This is a low level thread control primitive that should only be called by - * higher thread synchronization functions. - * * Implies a memory barrier. */ void thread_sleep(struct spinlock *interlock, const void *wchan_addr, @@ -222,9 +219,6 @@ void thread_sleep(struct spinlock *interlock, const void *wchan_addr, * Schedule a thread for execution on a processor. * * No action is performed if the target thread is already in the running state. - * - * This is a low level thread control primitive that should only be called by - * higher thread synchronization functions. */ void thread_wakeup(struct thread *thread); @@ -261,6 +255,15 @@ void thread_tick_intr(void); void thread_setscheduler(struct thread *thread, unsigned char policy, unsigned short priority); +/* + * Variant used for priority inheritance. + * + * The caller must hold the turnstile thread data lock and no turnstile + * locks when calling this function. + */ +void thread_pi_setscheduler(struct thread *thread, unsigned char policy, + unsigned short priority); + static inline void thread_ref(struct thread *thread) { @@ -301,33 +304,72 @@ thread_wchan_desc(const struct thread *thread) char thread_state_to_chr(const struct thread *thread); static inline const struct thread_sched_data * -thread_get_sched_data(const struct thread *thread) +thread_get_user_sched_data(const struct thread *thread) { - return &thread->sched_data; + return &thread->user_sched_data; } +static inline const struct thread_sched_data * +thread_get_real_sched_data(const struct thread *thread) +{ + return &thread->real_sched_data; +} + +/* + * If the caller requires the scheduling data to be stable, it + * must lock one of the following objects : + * - the containing run queue + * - the per-thread turnstile data (turnstile_td) + * + * Both are locked when scheduling data are updated. 
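As a concrete illustration of this rule, a hypothetical helper (not part of the commit) that wants a consistent snapshot of another thread's effective priority can hold the per-thread turnstile data lock around the accessors, which is also what the test module added by this commit does:

static unsigned int
example_read_real_global_priority(struct thread *thread)
{
    struct turnstile_td *td;
    unsigned int global_priority;

    td = thread_turnstile_td(thread);

    /* The scheduling data cannot change while the td lock is held */
    turnstile_td_lock(td);
    global_priority = thread_real_global_priority(thread);
    turnstile_td_unlock(td);

    return global_priority;
}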
+ */ + static inline unsigned char -thread_sched_policy(const struct thread *thread) +thread_user_sched_policy(const struct thread *thread) { - return thread_get_sched_data(thread)->sched_policy; + return thread_get_user_sched_data(thread)->sched_policy; } static inline unsigned char -thread_sched_class(const struct thread *thread) +thread_user_sched_class(const struct thread *thread) { - return thread_get_sched_data(thread)->sched_class; + return thread_get_user_sched_data(thread)->sched_class; } static inline unsigned short -thread_priority(const struct thread *thread) +thread_user_priority(const struct thread *thread) { - return thread_get_sched_data(thread)->priority; + return thread_get_user_sched_data(thread)->priority; } static inline unsigned int -thread_global_priority(const struct thread *thread) +thread_user_global_priority(const struct thread *thread) +{ + return thread_get_user_sched_data(thread)->global_priority; +} + +static inline unsigned char +thread_real_sched_policy(const struct thread *thread) +{ + return thread_get_real_sched_data(thread)->sched_policy; +} + +static inline unsigned char +thread_real_sched_class(const struct thread *thread) +{ + return thread_get_real_sched_data(thread)->sched_class; +} + +static inline unsigned short +thread_real_priority(const struct thread *thread) { - return thread_get_sched_data(thread)->global_priority; + return thread_get_real_sched_data(thread)->priority; +} + +static inline unsigned int +thread_real_global_priority(const struct thread *thread) +{ + return thread_get_real_sched_data(thread)->global_priority; } /* @@ -367,6 +409,118 @@ thread_schedule(void) } /* + * Sleep queue lending functions. + */ + +static inline struct sleepq * +thread_sleepq_lend(void) +{ + struct sleepq *sleepq; + + sleepq = thread_self()->priv_sleepq; + assert(sleepq != NULL); + thread_self()->priv_sleepq = NULL; + return sleepq; +} + +static inline void +thread_sleepq_return(struct sleepq *sleepq) +{ + assert(sleepq != NULL); + assert(thread_self()->priv_sleepq == NULL); + thread_self()->priv_sleepq = sleepq; +} + +/* + * Condition variable related functions. + */ + +static inline void +thread_set_last_cond(struct condition *last_cond) +{ + struct thread *thread; + + thread = thread_self(); + assert(thread->last_cond == NULL); + thread->last_cond = last_cond; +} + +static inline struct condition * +thread_pull_last_cond(void) +{ + struct condition *last_cond; + struct thread *thread; + + thread = thread_self(); + last_cond = thread->last_cond; + + if (last_cond != NULL) { + thread->last_cond = NULL; + } + + return last_cond; +} + +static inline void +thread_wakeup_last_cond(void) +{ + struct condition *last_cond; + + last_cond = thread_pull_last_cond(); + + if (last_cond != NULL) { + condition_wakeup(last_cond); + } +} + +/* + * Turnstile lending functions. + */ + +static inline struct turnstile * +thread_turnstile_lend(void) +{ + struct turnstile *turnstile; + + turnstile = thread_self()->priv_turnstile; + assert(turnstile != NULL); + thread_self()->priv_turnstile = NULL; + return turnstile; +} + +static inline void +thread_turnstile_return(struct turnstile *turnstile) +{ + assert(turnstile != NULL); + assert(thread_self()->priv_turnstile == NULL); + thread_self()->priv_turnstile = turnstile; +} + +static inline struct turnstile_td * +thread_turnstile_td(struct thread *thread) +{ + return &thread->turnstile_td; +} + +/* + * Priority propagation functions. 
+ */ + +static inline bool +thread_priority_propagation_needed(void) +{ + return thread_self()->propagate_priority; +} + +static inline void +thread_set_priority_propagation_needed(void) +{ + thread_self()->propagate_priority = true; +} + +void thread_propagate_priority(void); + +/* * Migration control functions. * * Functions that change the migration state are implicit compiler barriers. @@ -421,6 +575,10 @@ thread_preempt_enable_no_resched(void) thread = thread_self(); assert(thread->preempt != 0); thread->preempt--; + + if (thread_preempt_enabled() && thread_priority_propagation_needed()) { + thread_propagate_priority(); + } } static inline void diff --git a/kern/thread_i.h b/kern/thread_i.h index a062773d..00d40ca6 100644 --- a/kern/thread_i.h +++ b/kern/thread_i.h @@ -18,15 +18,23 @@ #ifndef _KERN_THREAD_I_H #define _KERN_THREAD_I_H +#include <stdbool.h> + #include <kern/condition_types.h> #include <kern/cpumap.h> #include <kern/list_types.h> #include <kern/macros.h> #include <kern/mutex_types.h> #include <kern/param.h> +#include <kern/turnstile_types.h> #include <machine/atomic.h> #include <machine/tcb.h> +/* + * Forward declarations. + */ +struct sleepq; + struct thread_runq; struct thread_fs_runq; @@ -86,12 +94,38 @@ struct thread { /* Flags must be changed atomically */ unsigned long flags; - /* Sleep/wakeup synchronization members */ + /* Sleep/wake-up synchronization members */ struct thread_runq *runq; + bool in_runq; const void *wchan_addr; const char *wchan_desc; unsigned short state; + /* Sleep queue available for lending */ + struct sleepq *priv_sleepq; + + /* Turnstile available for lending */ + struct turnstile *priv_turnstile; + + /* + * When a thread wakes up after waiting for a condition variable, + * it sets this variable so that other waiters are only awaken + * after the associated mutex has actually been released. + * + * This member is thread-local. + */ + struct condition *last_cond; + + /* Per-thread turnstile data */ + struct turnstile_td turnstile_td; + + /* + * True if priority must be propagated when preemption is reenabled + * + * This member is thread-local. + */ + bool propagate_priority; + /* Thread-local members */ unsigned short preempt; unsigned short pinned; @@ -100,8 +134,21 @@ struct thread { /* Processors on which this thread is allowed to run */ struct cpumap cpumap; - /* Scheduling data */ - struct thread_sched_data sched_data; + /* + * Scheduling data. + */ + struct thread_sched_data user_sched_data; /* User-provided */ + struct thread_sched_data real_sched_data; /* Computed from + priority propagation */ + + /* + * True if the real scheduling data are not the user scheduling data. + * + * Note that it doesn't provide any information about priority inheritance. + * A thread may be part of a priority inheritance chain without its + * priority being boosted. + */ + bool boosted; /* Class specific scheduling data */ union { diff --git a/kern/turnstile.c b/kern/turnstile.c new file mode 100644 index 00000000..43629376 --- /dev/null +++ b/kern/turnstile.c @@ -0,0 +1,788 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This implementation is based on "Solaris(tm) Internals: Solaris 10 and + * OpenSolaris Kernel Architecture, Second Edition" by Richard McDougall + * and Jim Mauro. See "Part Six: Platform Specifics, Chapter 17: Locking + * and Synchronization, Section 7 Turnstiles and Priority Inheritance". + * + * The differences are outlined below. + * + * This implementation doesn't support read/write locks, only mutual + * exclusion, because ownership doesn't apply well to read/write locks. + * + * Instead of using an external sleep queue object, this module implements + * that functionality itself. The reasons behind this decision are : + * - the use of expensive priority lists used to queue threads, that + * a simpler sleep queue implementation shouldn't use + * - increased coupling with the scheduler + * + * Locking order : bucket (turnstile) -> turnstile_td -> thread_runq + * + * This order allows changing the owner of a turnstile without unlocking it + * which is important because a turnstile is normally used to synchronize + * access to the owner. Unlocking a turnstile would allow the owner to + * change and make complex transient states visible. The drawback is that + * a thread cannot be requeued atomically when its priority is changed. + * That deferred requeue is done during priority propagation. + * + * TODO Analyse hash parameters. + */ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include <kern/assert.h> +#include <kern/init.h> +#include <kern/kmem.h> +#include <kern/list.h> +#include <kern/macros.h> +#include <kern/param.h> +#include <kern/plist.h> +#include <kern/spinlock.h> +#include <kern/thread.h> +#include <kern/turnstile.h> +#include <kern/turnstile_types.h> + +/* + * Locking keys : + * (b) bucket + */ +struct turnstile_bucket { + struct spinlock lock; + struct list list; /* (b) */ +} __aligned(CPU_L1_SIZE); + +/* + * Adding/removing waiters to/from a turnstile are performed while + * holding both the waiter's thread data and the turnstile locks. + * + * Changing the owner of a turnstile and linking/unlinking the turnstile + * into/from the owner's list of owned turnstiles are done atomically, + * while holding both the owner's thread data and the turnstile locks. + * + * Locking keys : + * (b) bucket + * (t) turnstile_td + * + * (*) A turnstile is referenced in thread data after/before being + * added/removed to/from its bucket. Referencing a turnstile in + * thread data requires holding the thread data lock. This implies + * that, while holding the thread data lock, if the referenced + * turnstile isn't NULL, the bucket pointer is also not NULL and + * stable. 
+ */ +struct turnstile { + struct turnstile_bucket *bucket; /* (b*) */ + struct list node; /* (b) */ + const void *sync_obj; /* (b) */ + struct plist waiters; /* (b,t) */ + struct turnstile *next_free; /* (b) */ + struct turnstile_waiter *top_waiter; /* (b,t) */ + struct thread *owner; /* (b,t) */ + struct plist_node td_node; /* (t) */ +}; + +/* + * Locking keys : + * (b) bucket + * (t) turnstile_td + */ +struct turnstile_waiter { + struct plist_node node; /* (b,t) */ + struct thread *thread; /* (b,t) */ + bool awaken; /* (b) */ +}; + +#define TURNSTILE_HTABLE_SIZE 128 + +#if !ISP2(TURNSTILE_HTABLE_SIZE) +#error hash table size must be a power of two +#endif /* !ISP2(TURNSTILE_HTABLE_SIZE) */ + +#define TURNSTILE_HTABLE_MASK (TURNSTILE_HTABLE_SIZE - 1) + +static struct turnstile_bucket turnstile_htable[TURNSTILE_HTABLE_SIZE]; + +static struct kmem_cache turnstile_cache; + +static uintptr_t +turnstile_hash(const void *addr) +{ + return ((uintptr_t)addr >> 8) ^ (uintptr_t)addr; +} + +static void +turnstile_waiter_init(struct turnstile_waiter *waiter, struct thread *thread) +{ + plist_node_init(&waiter->node, thread_real_global_priority(thread)); + waiter->thread = thread; + waiter->awaken = false; +} + +static unsigned int +turnstile_waiter_priority(const struct turnstile_waiter *waiter) +{ + return plist_node_priority(&waiter->node); +} + +static bool +turnstile_waiter_awaken(const struct turnstile_waiter *waiter) +{ + return waiter->awaken; +} + +static void +turnstile_waiter_set_awaken(struct turnstile_waiter *waiter) +{ + waiter->awaken = true; +} + +static void +turnstile_waiter_clear_awaken(struct turnstile_waiter *waiter) +{ + waiter->awaken = false; +} + +static void +turnstile_waiter_wakeup(struct turnstile_waiter *waiter) +{ + if (turnstile_waiter_awaken(waiter)) { + return; + } + + thread_wakeup(waiter->thread); + turnstile_waiter_set_awaken(waiter); +} + +static void +turnstile_update_top_waiter(struct turnstile *turnstile) +{ + if (turnstile_empty(turnstile)) { + turnstile->top_waiter = NULL; + return; + } + + turnstile->top_waiter = plist_last_entry(&turnstile->waiters, + struct turnstile_waiter, node); +} + +static void +turnstile_add_waiter(struct turnstile *turnstile, + struct turnstile_waiter *waiter) +{ + assert(!turnstile_waiter_awaken(waiter)); + plist_add(&turnstile->waiters, &waiter->node); + turnstile_update_top_waiter(turnstile); +} + +static void +turnstile_remove_waiter(struct turnstile *turnstile, + struct turnstile_waiter *waiter) +{ + assert(turnstile_waiter_awaken(waiter)); + plist_remove(&turnstile->waiters, &waiter->node); + turnstile_update_top_waiter(turnstile); +} + +static void +turnstile_waiter_requeue(struct turnstile *turnstile, + struct turnstile_waiter *waiter) +{ + unsigned int global_priority; + + global_priority = thread_real_global_priority(waiter->thread); + assert(global_priority != plist_node_priority(&waiter->node)); + + plist_remove(&turnstile->waiters, &waiter->node); + plist_node_set_priority(&waiter->node, global_priority); + plist_add(&turnstile->waiters, &waiter->node); + turnstile_update_top_waiter(turnstile); +} + +static void +turnstile_td_set_waiter(struct turnstile_td *td, + struct turnstile_waiter *waiter) +{ + td->waiter = waiter; +} + +static struct turnstile_waiter * +turnstile_td_get_waiter(const struct turnstile_td *td) +{ + return td->waiter; +} + +static void +turnstile_td_update_top_priority(struct turnstile_td *td) +{ + struct turnstile_waiter *top_waiter; + struct turnstile *top_turnstile; + + if 
(plist_empty(&td->owned_turnstiles)) { + td->top_global_priority = 0; + return; + } + + top_turnstile = plist_last_entry(&td->owned_turnstiles, + struct turnstile, td_node); + top_waiter = top_turnstile->top_waiter; + + if (top_waiter == NULL) { + td->top_global_priority = 0; + } else { + td->top_global_priority = turnstile_waiter_priority(top_waiter); + td->top_sched_policy = thread_real_sched_policy(top_waiter->thread); + td->top_priority = thread_real_priority(top_waiter->thread); + } +} + +static void +turnstile_td_own(struct turnstile_td *td, struct turnstile *turnstile) +{ + struct turnstile_waiter *top_waiter; + unsigned int top_priority; + + assert(turnstile->owner == NULL); + + top_waiter = turnstile->top_waiter; + assert(top_waiter != NULL); + top_priority = thread_real_global_priority(top_waiter->thread); + plist_node_init(&turnstile->td_node, top_priority); + plist_add(&td->owned_turnstiles, &turnstile->td_node); + turnstile_td_update_top_priority(td); + + turnstile->owner = structof(td, struct thread, turnstile_td); +} + +static void +turnstile_td_disown(struct turnstile_td *td, struct turnstile *turnstile) +{ + assert(turnstile->owner == structof(td, struct thread, turnstile_td)); + + assert(!plist_node_unlinked(&turnstile->td_node)); + plist_remove(&td->owned_turnstiles, &turnstile->td_node); + turnstile_td_update_top_priority(td); + + turnstile->owner = NULL; +} + +static void +turnstile_td_reown(struct turnstile_td *td, struct turnstile *turnstile) +{ + assert(turnstile->owner == structof(td, struct thread, turnstile_td)); + assert(!plist_node_unlinked(&turnstile->td_node)); + plist_remove(&td->owned_turnstiles, &turnstile->td_node); + turnstile->owner = NULL; + turnstile_td_own(td, turnstile); +} + +/* + * The caller must hold the turnstile thread data lock and no turnstile + * locks when calling this function. The thread data are unlocked on return. + * + * In addition, this function drops a reference on the thread associated + * with the given thread data. + */ +static void +turnstile_td_propagate_priority_loop(struct turnstile_td *td) +{ + unsigned int user_priority, real_priority, top_priority; + struct turnstile_waiter *waiter; + struct turnstile *turnstile; + struct thread *thread; + unsigned short priority; + unsigned char policy; + int error; + + /* + * At the very least, this function must make sure that : + * - the given thread has its intended priority, which is the + * highest among its own and all the waiters in the turnstiles + * it owns, and + * - the thread is at its intended position in the turnstile it's + * waiting on, if any. 
+ */ + + for (;;) { + thread = structof(td, struct thread, turnstile_td); + user_priority = thread_user_global_priority(thread); + real_priority = thread_real_global_priority(thread); + top_priority = td->top_global_priority; + + if (top_priority > user_priority) { + policy = td->top_sched_policy; + priority = td->top_priority; + } else { + top_priority = user_priority; + policy = thread_user_sched_policy(thread); + priority = thread_user_priority(thread); + } + + if (top_priority != real_priority) { + thread_pi_setscheduler(thread, policy, priority); + } + + waiter = turnstile_td_get_waiter(td); + + if ((waiter == NULL) + || (top_priority == turnstile_waiter_priority(waiter))) { + spinlock_unlock(&td->lock); + thread_unref(thread); + return; + } + + turnstile = turnstile_td_get_turnstile(td); + assert(turnstile != NULL); + + error = spinlock_trylock(&turnstile->bucket->lock); + + if (error) { + spinlock_unlock(&td->lock); + spinlock_lock(&td->lock); + continue; + } + + /* + * This couldn't be done while changing the thread's priority + * because of locking restrictions. Do it now. + */ + turnstile_waiter_requeue(turnstile, waiter); + + spinlock_unlock(&td->lock); + thread_unref(thread); + + thread = turnstile->owner; + + if (thread == NULL) { + break; + } + + td = thread_turnstile_td(thread); + + thread_ref(thread); + spinlock_lock(&td->lock); + + turnstile_td_reown(td, turnstile); + + spinlock_unlock(&turnstile->bucket->lock); + } + + spinlock_unlock(&turnstile->bucket->lock); +} + +void +turnstile_td_propagate_priority(struct turnstile_td *td) +{ + struct thread *thread; + + thread = structof(td, struct thread, turnstile_td); + + thread_ref(thread); + spinlock_lock(&td->lock); + turnstile_td_propagate_priority_loop(td); +} + +static void +turnstile_assert_init_state(const struct turnstile *turnstile) +{ + assert(turnstile->bucket == NULL); + assert(turnstile->sync_obj == NULL); + assert(plist_empty(&turnstile->waiters)); + assert(turnstile->next_free == NULL); + assert(turnstile->top_waiter == NULL); + assert(turnstile->owner == NULL); +} + +static void +turnstile_use(struct turnstile *turnstile, const void *sync_obj) +{ + assert(turnstile->sync_obj == NULL); + turnstile->sync_obj = sync_obj; +} + +static void +turnstile_unuse(struct turnstile *turnstile) +{ + assert(turnstile->sync_obj != NULL); + turnstile->sync_obj = NULL; +} + +static bool +turnstile_in_use(const struct turnstile *turnstile) +{ + return turnstile->sync_obj != NULL; +} + +static bool +turnstile_in_use_by(const struct turnstile *turnstile, const void *sync_obj) +{ + return turnstile->sync_obj == sync_obj; +} + +static void +turnstile_bucket_init(struct turnstile_bucket *bucket) +{ + spinlock_init(&bucket->lock); + list_init(&bucket->list); +} + +static struct turnstile_bucket * +turnstile_bucket_get(const void *sync_obj) +{ + uintptr_t index; + + index = turnstile_hash(sync_obj) & TURNSTILE_HTABLE_MASK; + assert(index < ARRAY_SIZE(turnstile_htable)); + return &turnstile_htable[index]; +} + +static void +turnstile_bucket_add(struct turnstile_bucket *bucket, + struct turnstile *turnstile) +{ + assert(turnstile->bucket == NULL); + turnstile->bucket = bucket; + list_insert_tail(&bucket->list, &turnstile->node); +} + +static void +turnstile_bucket_remove(struct turnstile_bucket *bucket, + struct turnstile *turnstile) +{ + assert(turnstile->bucket == bucket); + turnstile->bucket = NULL; + list_remove(&turnstile->node); +} + +static struct turnstile * +turnstile_bucket_lookup(const struct turnstile_bucket *bucket, + const 
void *sync_obj) +{ + struct turnstile *turnstile; + + list_for_each_entry(&bucket->list, turnstile, node) { + if (turnstile_in_use_by(turnstile, sync_obj)) { + return turnstile; + } + } + + return NULL; +} + +static void +turnstile_ctor(void *ptr) +{ + struct turnstile *turnstile; + + turnstile = ptr; + turnstile->bucket = NULL; + turnstile->sync_obj = NULL; + plist_init(&turnstile->waiters); + turnstile->next_free = NULL; + turnstile->top_waiter = NULL; + turnstile->owner = NULL; +} + +void __init +turnstile_setup(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(turnstile_htable); i++) { + turnstile_bucket_init(&turnstile_htable[i]); + } + + kmem_cache_init(&turnstile_cache, "turnstile", sizeof(struct turnstile), + CPU_L1_SIZE, turnstile_ctor, 0); +} + +struct turnstile * +turnstile_create(void) +{ + struct turnstile *turnstile; + + turnstile = kmem_cache_alloc(&turnstile_cache); + + if (turnstile == NULL) { + return NULL; + } + + turnstile_assert_init_state(turnstile); + return turnstile; +} + +void +turnstile_destroy(struct turnstile *turnstile) +{ + turnstile_assert_init_state(turnstile); + kmem_cache_free(&turnstile_cache, turnstile); +} + +struct turnstile * +turnstile_acquire(const void *sync_obj) +{ + struct turnstile_bucket *bucket; + struct turnstile *turnstile; + + assert(sync_obj != NULL); + + bucket = turnstile_bucket_get(sync_obj); + + spinlock_lock(&bucket->lock); + + turnstile = turnstile_bucket_lookup(bucket, sync_obj); + + if (turnstile == NULL) { + spinlock_unlock(&bucket->lock); + return NULL; + } + + return turnstile; +} + +void +turnstile_release(struct turnstile *turnstile) +{ + spinlock_unlock(&turnstile->bucket->lock); +} + +static void +turnstile_push_free(struct turnstile *turnstile, + struct turnstile *free_turnstile) +{ + assert(free_turnstile->next_free == NULL); + free_turnstile->next_free = turnstile->next_free; + turnstile->next_free = free_turnstile; +} + +static struct turnstile * +turnstile_pop_free(struct turnstile *turnstile) +{ + struct turnstile *free_turnstile; + + free_turnstile = turnstile->next_free; + + if (free_turnstile == NULL) { + return NULL; + } + + turnstile->next_free = free_turnstile->next_free; + free_turnstile->next_free = NULL; + return free_turnstile; +} + +struct turnstile * +turnstile_lend(const void *sync_obj) +{ + struct turnstile_bucket *bucket; + struct turnstile *turnstile, *prev; + struct turnstile_td *td; + + assert(sync_obj != NULL); + + turnstile = thread_turnstile_lend(); + turnstile_assert_init_state(turnstile); + + td = thread_turnstile_td(thread_self()); + bucket = turnstile_bucket_get(sync_obj); + + spinlock_lock(&bucket->lock); + + prev = turnstile_bucket_lookup(bucket, sync_obj); + + if (prev == NULL) { + turnstile_use(turnstile, sync_obj); + turnstile_bucket_add(bucket, turnstile); + } else { + turnstile_push_free(prev, turnstile); + turnstile = prev; + } + + spinlock_lock(&td->lock); + turnstile_td_set_turnstile(td, turnstile); + spinlock_unlock(&td->lock); + + return turnstile; +} + +void +turnstile_return(struct turnstile *turnstile) +{ + struct turnstile_bucket *bucket; + struct turnstile *free_turnstile; + struct turnstile_td *td; + + assert(turnstile_in_use(turnstile)); + + td = thread_turnstile_td(thread_self()); + + spinlock_lock(&td->lock); + turnstile_td_set_turnstile(td, NULL); + spinlock_unlock(&td->lock); + + bucket = turnstile->bucket; + free_turnstile = turnstile_pop_free(turnstile); + + if (free_turnstile == NULL) { + turnstile_bucket_remove(bucket, turnstile); + 
turnstile_unuse(turnstile); + free_turnstile = turnstile; + } + + spinlock_unlock(&bucket->lock); + + turnstile_assert_init_state(free_turnstile); + thread_turnstile_return(free_turnstile); +} + +bool +turnstile_empty(const struct turnstile *turnstile) +{ + return plist_empty(&turnstile->waiters); +} + +static void +turnstile_set_owner(struct turnstile *turnstile, struct thread *owner) +{ + struct turnstile_td *td; + + assert(owner != NULL); + assert((turnstile->owner == NULL) || (turnstile->owner == owner)); + + td = thread_turnstile_td(owner); + + thread_ref(owner); + spinlock_lock(&td->lock); + + if (turnstile->owner == NULL) { + turnstile_td_own(td, turnstile); + } + + spinlock_unlock(&turnstile->bucket->lock); + + turnstile_td_propagate_priority_loop(td); + + spinlock_lock(&turnstile->bucket->lock); +} + +void +turnstile_wait(struct turnstile *turnstile, const char *wchan, + struct thread *owner) +{ + struct turnstile_waiter waiter; + struct turnstile_td *td; + struct thread *thread; + + thread = thread_self(); + assert(thread != owner); + + td = thread_turnstile_td(thread); + + spinlock_lock(&td->lock); + turnstile_waiter_init(&waiter, thread); + turnstile_add_waiter(turnstile, &waiter); + turnstile_td_set_waiter(td, &waiter); + spinlock_unlock(&td->lock); + + if (owner == NULL) { + if (turnstile->top_waiter == &waiter) { + turnstile_waiter_set_awaken(&waiter); + } + } else { + /* This function temporarily unlocks the turnstile */ + turnstile_set_owner(turnstile, owner); + } + + for (;;) { + if (!turnstile_waiter_awaken(&waiter)) { + thread_sleep(&turnstile->bucket->lock, turnstile->sync_obj, wchan); + } + + /* + * The real priority of a thread may change between waking up + * and reacquiring the turnstile. + */ + if (turnstile->top_waiter == &waiter) { + break; + } + + /* Otherwise, make sure the new top waiter is awaken */ + turnstile_waiter_wakeup(turnstile->top_waiter); + turnstile_waiter_clear_awaken(&waiter); + } + + spinlock_lock(&td->lock); + turnstile_td_set_waiter(td, NULL); + turnstile_remove_waiter(turnstile, &waiter); + spinlock_unlock(&td->lock); +} + +void +turnstile_signal(struct turnstile *turnstile) +{ + struct turnstile_waiter *waiter; + + if (turnstile_empty(turnstile)) { + return; + } + + waiter = plist_last_entry(&turnstile->waiters, + struct turnstile_waiter, node); + turnstile_waiter_wakeup(waiter); +} + +void +turnstile_own(struct turnstile *turnstile) +{ + struct turnstile_td *td; + struct thread *owner; + unsigned int top_priority; + + assert(turnstile->owner == NULL); + + if (turnstile_empty(turnstile)) { + return; + } + + owner = thread_self(); + top_priority = turnstile_waiter_priority(turnstile->top_waiter); + assert(thread_real_global_priority(owner) >= top_priority); + td = thread_turnstile_td(owner); + + spinlock_lock(&td->lock); + turnstile_td_own(td, turnstile); + spinlock_unlock(&td->lock); +} + +void +turnstile_disown(struct turnstile *turnstile) +{ + struct turnstile_td *td; + struct thread *owner; + + owner = thread_self(); + assert(turnstile->owner == owner); + assert(!turnstile_empty(turnstile)); + + td = thread_turnstile_td(owner); + + spinlock_lock(&td->lock); + turnstile_td_disown(td, turnstile); + spinlock_unlock(&td->lock); +} diff --git a/kern/turnstile.h b/kern/turnstile.h new file mode 100644 index 00000000..21a5a1a9 --- /dev/null +++ b/kern/turnstile.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2017 Richard Braun. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Priority propagation capable sleep queues. + * + * Turnstiles are used to build sleeping synchronization primitives where + * ownership applies, such as mutexes. They allow threads with different + * priorities to contend on the same synchronization object without + * unbounded priority inversion. + */ + +#ifndef _KERN_TURNSTILE_H +#define _KERN_TURNSTILE_H + +#include <stdbool.h> +#include <stddef.h> + +#include <kern/plist.h> +#include <kern/spinlock.h> +#include <kern/thread.h> +#include <kern/turnstile_types.h> + +struct turnstile; + +/* + * Turnstile thread data. + */ +struct turnstile_td; + +#define turnstile_td_assert_lock(td) spinlock_assert_locked(&(td)->lock) + +/* + * Initialize turnstile thread data. + */ +static inline void +turnstile_td_init(struct turnstile_td *td) +{ + spinlock_init(&td->lock); + td->turnstile = NULL; + td->waiter = NULL; + plist_init(&td->owned_turnstiles); + td->top_global_priority = 0; +} + +/* + * Turnstile thread data locking functions. + */ + +static inline void +turnstile_td_lock(struct turnstile_td *td) +{ + spinlock_lock(&td->lock); +} + +static inline int +turnstile_td_trylock(struct turnstile_td *td) +{ + return spinlock_trylock(&td->lock); +} + +static inline void +turnstile_td_unlock(struct turnstile_td *td) +{ + spinlock_unlock(&td->lock); +} + +/* + * Functions managing the turnstile a thread is sleeping in. + */ + +static inline void +turnstile_td_set_turnstile(struct turnstile_td *td, + struct turnstile *turnstile) +{ + td->turnstile = turnstile; +} + +static inline struct turnstile * +turnstile_td_get_turnstile(const struct turnstile_td *td) +{ + return td->turnstile; +} + +/* + * Propagate priority starting at the thread containing the given thread data. + */ +void turnstile_td_propagate_priority(struct turnstile_td *td); + +/* + * Initialize the turnstile module. + */ +void turnstile_setup(void); + +/* + * Create/destroy a turnstile. + */ +struct turnstile * turnstile_create(void); +void turnstile_destroy(struct turnstile *turnstile); + +/* + * Acquire/release a turnstile. + * + * Acquiring a turnstile serializes all access and disables preemption. + */ +struct turnstile * turnstile_acquire(const void *sync_obj); +void turnstile_release(struct turnstile *turnstile); + +/* + * Lend/return a turnstile. + * + * A thread lends its private turnstile to the turnstile module in + * order to prepare its sleep. The turnstile obtained on lending + * is either the thread's turnstile, or an already existing turnstile + * for this synchronization object if another thread is waiting. + * + * When multiple threads are waiting on the same turnstile, the extra + * turnstiles lent are kept in an internal free list, used when threads + * are awaken to return a turnstile to them. + * + * Note that the turnstile returned may not be the one lent. 
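For additional context, here is a rough sketch of how a priority inheritance mutex could combine the lend/return protocol with ownership tracking. It is only an illustration, not the rtmutex code added by this commit: the example_pi_lock type, its atomic helpers (analogous to the stand-ins sketched earlier for the sleep queue example) and the exact unlock sequence are assumptions, and resetting the releasing thread's boosted priority is deliberately left out.

struct example_pi_lock {
    struct thread *owner;       /* hypothetical owner word */
};

static void
example_pi_lock_slow(struct example_pi_lock *lock)
{
    struct turnstile *turnstile;
    struct thread *owner;

    turnstile = turnstile_lend(lock);

    for (;;) {
        /*
         * example_pi_lock_try() stands for an atomic compare-and-swap
         * that installs thread_self() as owner and returns the previous
         * owner, or NULL on success; it is assumed for this sketch.
         */
        owner = example_pi_lock_try(lock);

        if (owner == NULL) {
            break;
        }

        /* Propagate the caller's priority along the ownership chain */
        turnstile_wait(turnstile, "pilock", owner);
    }

    /* Record ownership so that future waiters keep boosting this thread */
    turnstile_own(turnstile);
    turnstile_return(turnstile);
}

static void
example_pi_unlock_slow(struct example_pi_lock *lock)
{
    struct turnstile *turnstile;

    turnstile = turnstile_acquire(lock);

    if (turnstile == NULL) {
        /* Nobody is waiting */
        example_pi_lock_clear(lock);    /* hypothetical atomic clear */
        return;
    }

    /*
     * Release the owner word and the turnstile ownership while the
     * turnstile is held, so that waiters never observe a stale owner,
     * then wake up the highest priority waiter.
     */
    example_pi_lock_clear(lock);
    turnstile_disown(turnstile);
    turnstile_signal(turnstile);
    turnstile_release(turnstile);
}

A real implementation also has to handle the case where the lock was taken on the fast path without ever owning a turnstile, and the deboosting of the releasing thread; this sketch glosses over both, the latter being handled in the commit by the priority propagation machinery added to kern/thread.c and kern/turnstile.c.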
+ * + * The turnstile obtained when lending is automatically acquired. + */ +struct turnstile * turnstile_lend(const void *sync_obj); +void turnstile_return(struct turnstile *turnstile); + +/* + * Return true if the given turnstile has no waiters. + * + * The turnstile must be acquired when calling this function. + */ +bool turnstile_empty(const struct turnstile *turnstile); + +/* + * Wait for a wake up on the given turnstile. + * + * The turnstile must be lent when calling this function. It is + * released and later reacquired before returning from this function. + * + * The calling thread is considered a waiter as long as it didn't + * reacquire the turnstile. This means that signalling a turnstile + * has no visible effect on the number of waiters until the turnstile + * is released, e.g. if a single thread is waiting and another signals + * the turnstile, the turnstile is not immediately considered empty. + * + * If owner isn't NULL, it must refer to the thread currently owning + * the associated synchronization object. The priority of the caller + * is propagated to the chain of turnstiles and owners as necessary + * to prevent unbounded priority inversion. + */ +void turnstile_wait(struct turnstile *turnstile, const char *wchan, + struct thread *owner); + +/* + * Wake up a thread waiting on the given turnstile, if any. + * + * The turnstile must be acquired when calling this function. + * Since a turnstile must be lent (and in turn is automatically + * acquired) when waiting, and acquired in order to signal it, + * wake-ups are serialized and cannot be missed. + */ +void turnstile_signal(struct turnstile *turnstile); + +/* + * Own/disown a turnstile. + * + * The turnstile must be lent when taking ownership, acquired when + * releasing it. Owning has no effect on empty turnstiles. + * Conversely, an empty turnstile cannot be disowned. + * + * Ownership must be updated atomically with regard to the ownership + * of the associated synchronization object. + */ +void turnstile_own(struct turnstile *turnstile); +void turnstile_disown(struct turnstile *turnstile); + +#endif /* _KERN_TURNSTILE_H */ diff --git a/kern/turnstile_types.h b/kern/turnstile_types.h new file mode 100644 index 00000000..a95b691e --- /dev/null +++ b/kern/turnstile_types.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Isolated type definition used to avoid inclusion circular dependencies. + */ + +#ifndef _KERN_TURNSTILE_TYPES_H +#define _KERN_TURNSTILE_TYPES_H + +#include <kern/plist_types.h> +#include <kern/spinlock_types.h> + +struct turnstile; +struct turnstile_waiter; + +/* + * Per-thread turnstile data. + * + * The turnstile member indicates whether this thread is in a turnstile, + * and is only valid if the thread is not running. + * + * The waiter points to the structure a thread uses to queue itself on + * a turnstile. 
It is used to access a sleeping thread from another + * thread, e.g. on wake-ups or priority updates. + * + * The list of owned turnstiles is used by priority propagation to + * determine the top priority among all waiters, which is stored in + * the thread data so that turnstiles are quickly unlocked. + * + * Locking keys : + * (b) bucket + * (t) turnstile_td + */ +struct turnstile_td { + struct spinlock lock; + struct turnstile *turnstile; /* (t) */ + struct turnstile_waiter *waiter; /* (b,t) */ + struct plist owned_turnstiles; /* (t) */ + unsigned int top_global_priority; /* (t) */ + unsigned char top_sched_policy; /* (t) */ + unsigned short top_priority; /* (t) */ +}; + +#endif /* _KERN_TURNSTILE_TYPES_H */ diff --git a/test/test_mutex_pi.c b/test/test_mutex_pi.c new file mode 100644 index 00000000..1adcf627 --- /dev/null +++ b/test/test_mutex_pi.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2017 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This test module is a stress test, expected to never terminate, of + * priority inheritance with mutexes. It creates a priority inheritance + * tree by starting multiple threads manipulating multiple mutexes. + * + * Here is one intended state of the priority inheritance tree : + * + * + * C->M2-+ + * | + * D--+ +->B->M1->A + * | | + * +->M3-+ + * | + * E--+ + * + * + * M1,M2,M3,etc... are mutexes and A,B,C,etc... are threads. The thread + * priorities p(thread) are ordered such that p(A) < p(B) < p(C) etc... + * An arrow from a mutex to a thread indicates ownership, such that + * M1->A means that A owns M1. An arrow from a thread to a mutex indicates + * waiting, such that B->M1 means that B is waiting for M1 to be unlocked. + * + * In addition, thread B is actually many threads, each terminating after + * unlocking their mutexes. Also, the priority of thread C is regularly + * increased to p(E) + 1 and later restored to p(C). + * + * Here is the list of all the cases this test must cover : + * - Priority inheritance: All threads can get their real priority boosted. + * C can be boosted if it owns M2 and B waits for it while inheriting + * from E, and E can be boosted when p(C) is temporarily increased. + * - Return to normal priority: after releasing all locks, a thread must + * have reset its real priority to its user priority. + * - Priority changes: When the priority of C is increased, that priority + * must take precedence over all others. + * - Thread destruction resilience: Check that priority propagation never + * accesses a destroyed thread. + * + * Note that this test doesn't check that priority propagation correctly + * adjusts the top priority after lowering the priority of thread C back + * to p(C). + * + * In order to artificially create priority inversions, all threads run on + * separate processors, making this test require 5 processors. 
+ *
+ * TODO Use timers instead of busy-waiting so that binding to processors
+ * isn't required.
+ *
+ * The test should output a couple of messages about thread priorities
+ * being boosted, and then frequent updates from each thread to show
+ * they're all making progress. Thread B suffers from contention the most,
+ * so its report frequency should be lower. Thread A suffers from contention
+ * the least and should report progress most frequently. Because of
+ * contention from B, D and E on M3, D rarely gets boosted. The reason is
+ * that, when B owns the mutex, E is likely to wait on M3 soon enough that
+ * it will be awakened before D, preventing the conditions for priority
+ * inheritance from occurring.
+ *
+ * Note that the test uses regular mutexes instead of real-time mutexes,
+ * so that its behaviour can be analyzed for both types depending on
+ * build options.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include <kern/cpumap.h>
+#include <kern/error.h>
+#include <kern/mutex.h>
+#include <kern/panic.h>
+#include <kern/printk.h>
+#include <kern/thread.h>
+#include <kern/turnstile.h>
+#include <test/test.h>
+
+#define TEST_PRIO_A (THREAD_SCHED_RT_PRIO_MIN + 1)
+#define TEST_PRIO_B (TEST_PRIO_A + 1)
+#define TEST_PRIO_C (TEST_PRIO_B + 1)
+#define TEST_PRIO_D (TEST_PRIO_C + 1)
+#define TEST_PRIO_E (TEST_PRIO_D + 1)
+
+#define TEST_NR_LOCK_LOOPS 500
+#define TEST_NR_CONSUME_CPU_LOOPS 10000000
+
+static struct mutex test_mutex_1;
+static struct mutex test_mutex_2;
+static struct mutex test_mutex_3;
+
+static const char *
+test_thread_from_priority(unsigned short priority)
+{
+    switch (priority) {
+    case TEST_PRIO_A:
+        return "a";
+    case TEST_PRIO_B:
+        return "b";
+    case TEST_PRIO_C:
+        return "c";
+    case TEST_PRIO_D:
+        return "d";
+    case TEST_PRIO_E:
+        return "e";
+    case TEST_PRIO_E + 1:
+        return "e+";
+    default:
+        panic("invalid priority %u", priority);
+    }
+}
+
+static char
+test_get_name(void)
+{
+    const char *name;
+    size_t length;
+
+    name = thread_self()->name;
+    length = strlen(name);
+    return name[length - 1];
+}
+
+static void
+test_consume_cpu(void)
+{
+    volatile unsigned int i;
+
+    for (i = 0; i < TEST_NR_CONSUME_CPU_LOOPS; i++);
+}
+
+static void
+test_check_initial_priority(void)
+{
+    unsigned short user_priority, real_priority;
+    struct thread *thread;
+
+    thread = thread_self();
+    user_priority = thread_user_priority(thread);
+    real_priority = thread_real_priority(thread);
+
+    if (user_priority != real_priority) {
+        panic("%c: invalid initial priority %hu",
+              test_get_name(), real_priority);
+    }
+}
+
+static void
+test_for_priority_boosted(unsigned short *highest_priority)
+{
+    unsigned short user_priority, real_priority;
+    struct turnstile_td *td;
+    struct thread *thread;
+
+    thread = thread_self();
+    td = thread_turnstile_td(thread);
+
+    turnstile_td_lock(td);
+
+    user_priority = thread_user_priority(thread);
+    real_priority = thread_real_priority(thread);
+
+    if (user_priority != real_priority) {
+        if (user_priority > real_priority) {
+            panic("%c: invalid real priority: %hu (boosted:%u)",
+                  test_get_name(), real_priority, thread->boosted);
+        }
+
+        if (real_priority > *highest_priority) {
+            printk("%c: real priority boosted to %s\n",
+                   test_get_name(), test_thread_from_priority(real_priority));
+            *highest_priority = real_priority;
+        }
+    }
+
+    turnstile_td_unlock(td);
+}
+
+static void
+test_for_priority_deboosted(void)
+{
+    unsigned short user_priority, real_priority;
+    struct turnstile_td *td;
+    struct thread *thread;
+
+    thread = thread_self();
+    td = 
thread_turnstile_td(thread); + + turnstile_td_lock(td); + + user_priority = thread_user_priority(thread); + real_priority = thread_real_priority(thread); + + if (user_priority != real_priority) { + panic("%c: real priority not reset (boosted:%d)", test_get_name(), thread->boosted); + } + + turnstile_td_unlock(td); +} + +static void +test_report_progress(unsigned int i) +{ + printk("%c:%u ", test_get_name(), i); +} + +static void +test_a(void *arg) +{ + unsigned short highest_priority; + unsigned int i, j; + + (void)arg; + + test_check_initial_priority(); + + highest_priority = 0; + + for (i = 1; /* no condition */; i++) { + for (j = 0; j < TEST_NR_LOCK_LOOPS; j++) { + mutex_lock(&test_mutex_1); + test_consume_cpu(); + test_for_priority_boosted(&highest_priority); + mutex_unlock(&test_mutex_1); + + test_for_priority_deboosted(); + + test_consume_cpu(); + } + + test_report_progress(i); + } +} + +static void +test_b(void *arg) +{ + test_check_initial_priority(); + + mutex_lock(&test_mutex_3); + mutex_lock(&test_mutex_2); + mutex_lock(&test_mutex_1); + test_consume_cpu(); + test_for_priority_boosted(arg); + mutex_unlock(&test_mutex_1); + test_consume_cpu(); + mutex_unlock(&test_mutex_2); + test_consume_cpu(); + mutex_unlock(&test_mutex_3); + + /* + * It would be better if the thread could immediately terminate, but + * it's also the thread that locks multiple mutexes, so make sure it + * was correctly deboosted. This should be cheap enough to not matter + * much. + */ + test_for_priority_deboosted(); +} + +static void +test_manage_b(void *arg) +{ + unsigned short highest_priority; + struct thread_attr attr; + struct thread *thread_b; + struct cpumap *cpumap; + unsigned int i, j; + int error; + + (void)arg; + + error = cpumap_create(&cpumap); + error_check(error, "cpumap_create"); + cpumap_zero(cpumap); + cpumap_set(cpumap, 1); + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_b"); + thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO); + thread_attr_set_priority(&attr, TEST_PRIO_B); + thread_attr_set_cpumap(&attr, cpumap); + cpumap_destroy(cpumap); + + highest_priority = 0; + + for (i = 1; /* no condition */; i++) { + for (j = 0; j < TEST_NR_LOCK_LOOPS; j++) { + error = thread_create(&thread_b, &attr, test_b, &highest_priority); + error_check(error, "thread_create"); + thread_join(thread_b); + + test_consume_cpu(); + } + + printk("b:%u ", i); + } +} + +static void +test_c(void *arg) +{ + unsigned short highest_priority; + unsigned int i, j; + + (void)arg; + + test_check_initial_priority(); + + highest_priority = 0; + + for (i = 1; /* no condition */; i++) { + for (j = 0; j < TEST_NR_LOCK_LOOPS; j++) { + mutex_lock(&test_mutex_2); + test_consume_cpu(); + test_for_priority_boosted(&highest_priority); + mutex_unlock(&test_mutex_2); + + test_for_priority_deboosted(); + + test_consume_cpu(); + } + + test_report_progress(i); + } +} + +static void +test_chprio_c(void *arg) +{ + struct thread *thread_c; + + thread_c = arg; + + test_consume_cpu(); + + for (;;) { + thread_setscheduler(thread_c, THREAD_SCHED_POLICY_FIFO, + TEST_PRIO_E + 1); + thread_setscheduler(thread_c, THREAD_SCHED_POLICY_FIFO, + TEST_PRIO_C); + } +} + +static void +test_d(void *arg) +{ + unsigned short highest_priority; + unsigned int i, j; + + (void)arg; + + test_check_initial_priority(); + + highest_priority = 0; + + for (i = 1; /* no condition */; i++) { + for (j = 0; j < TEST_NR_LOCK_LOOPS; j++) { + mutex_lock(&test_mutex_3); + test_consume_cpu(); + test_for_priority_boosted(&highest_priority); + 
mutex_unlock(&test_mutex_3); + + test_for_priority_deboosted(); + + test_consume_cpu(); + } + + test_report_progress(i); + } +} + +static void +test_e(void *arg) +{ + unsigned short highest_priority; + unsigned int i, j; + + (void)arg; + + test_check_initial_priority(); + + highest_priority = 0; + + for (i = 1; /* no condition */; i++) { + for (j = 0; j < TEST_NR_LOCK_LOOPS; j++) { + mutex_lock(&test_mutex_3); + test_consume_cpu(); + test_for_priority_boosted(&highest_priority); + mutex_unlock(&test_mutex_3); + + test_for_priority_deboosted(); + + test_consume_cpu(); + } + + test_report_progress(i); + } +} + +void +test_setup(void) +{ + struct thread_attr attr; + struct thread *thread; + struct cpumap *cpumap; + int error; + + mutex_init(&test_mutex_1); + mutex_init(&test_mutex_2); + mutex_init(&test_mutex_3); + + error = cpumap_create(&cpumap); + error_check(error, "cpumap_create"); + + cpumap_zero(cpumap); + cpumap_set(cpumap, 0); + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_a"); + thread_attr_set_detached(&attr); + thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO); + thread_attr_set_priority(&attr, TEST_PRIO_A); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_a, NULL); + error_check(error, "thread_create"); + + cpumap_zero(cpumap); + cpumap_set(cpumap, 1); + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_manage_b"); + thread_attr_set_detached(&attr); + thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO); + thread_attr_set_priority(&attr, TEST_PRIO_B); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_manage_b, NULL); + error_check(error, "thread_create"); + + cpumap_zero(cpumap); + cpumap_set(cpumap, 2); + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_c"); + thread_attr_set_detached(&attr); + thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO); + thread_attr_set_priority(&attr, TEST_PRIO_C); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_c, NULL); + error_check(error, "thread_create"); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_chprio_c"); + thread_attr_set_detached(&attr); + error = thread_create(&thread, &attr, test_chprio_c, thread); + error_check(error, "thread_create"); + + cpumap_zero(cpumap); + cpumap_set(cpumap, 3); + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_d"); + thread_attr_set_detached(&attr); + thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO); + thread_attr_set_priority(&attr, TEST_PRIO_D); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_d, NULL); + error_check(error, "thread_create"); + + cpumap_zero(cpumap); + cpumap_set(cpumap, 4); + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_e"); + thread_attr_set_detached(&attr); + thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_FIFO); + thread_attr_set_priority(&attr, TEST_PRIO_E); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_e, NULL); + error_check(error, "thread_create"); + + cpumap_destroy(cpumap); +} |
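
The kern/turnstile.h interface above describes turnstiles as the building block for sleeping synchronization primitives where ownership applies, such as mutexes, but the mutex and rtmutex code from this merge is not part of this section. The sketch below is therefore purely illustrative and not code from the merge: it shows one plausible way the documented calls could be combined in a simple sleeping lock. The sketch_lock type and functions are hypothetical, the sketch assumes turnstile_acquire() returns NULL when no turnstile has been lent for the synchronization object, and it deliberately omits the lock-free fast path and atomic owner word a real mutex would need, so it only demonstrates the call sequence and the documented preconditions.

#include <stddef.h>

#include <kern/thread.h>
#include <kern/turnstile.h>

struct sketch_lock {
    struct thread *owner;           /* Accessed with the turnstile held */
};

static void
sketch_lock_acquire(struct sketch_lock *lock)
{
    struct turnstile *turnstile;

    /* Lend the caller's private turnstile; the turnstile obtained is acquired */
    turnstile = turnstile_lend(lock);

    while (lock->owner != NULL) {
        /* Sleep, propagating the caller's priority to the current owner */
        turnstile_wait(turnstile, "sketch_lock", lock->owner);
    }

    lock->owner = thread_self();

    /* Take ownership; this has no effect if no other thread is waiting */
    turnstile_own(turnstile);

    turnstile_return(turnstile);
}

static void
sketch_lock_release(struct sketch_lock *lock)
{
    struct turnstile *turnstile;

    /* Assumed to return NULL when no thread has lent a turnstile */
    turnstile = turnstile_acquire(lock);

    if (turnstile == NULL) {
        lock->owner = NULL;
        return;
    }

    if (!turnstile_empty(turnstile)) {
        /*
         * Waiters exist: the turnstile is assumed to be owned by this
         * thread, in sync with lock ownership, so give ownership up
         * before waking a waiter.
         */
        turnstile_disown(turnstile);
        turnstile_signal(turnstile);
    }

    lock->owner = NULL;
    turnstile_release(turnstile);
}

A consequence of the lend/return scheme worth noting: since every blocking thread brings its own private turnstile, and extra turnstiles simply accumulate in the internal free list until waiters are awakened, the sleep path presumably never needs to allocate memory, and the number of turnstiles in use is bounded by the number of threads.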