From f923f7a361c8a0b4d081dc04672a59ccd8828704 Mon Sep 17 00:00:00 2001 From: Richard Braun Date: Tue, 20 Feb 2018 20:42:29 +0100 Subject: kern/thread: remove sref registration calls The upcoming RCU implementation requires scalable reference counters to be usable in interrupt context, and in particular, in the periodic tick handler, when an idle thread may be running, and the local processor is not registered. --- kern/thread.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kern/thread.c b/kern/thread.c index 20378ffb..63af1bde 100644 --- a/kern/thread.c +++ b/kern/thread.c @@ -2108,7 +2108,6 @@ static void thread_idle(void *arg) { struct thread *self; - int error; (void)arg; @@ -2116,13 +2115,6 @@ thread_idle(void *arg) for (;;) { thread_preempt_disable(); - error = sref_unregister(); - - if (error) { - assert(error == ERROR_BUSY); - goto error_sref; - } - llsync_unregister(); for (;;) { @@ -2137,9 +2129,6 @@ thread_idle(void *arg) } llsync_register(); - sref_register(); - -error_sref: thread_preempt_enable(); } } -- cgit v1.2.3 From 22dff6b7a6839e77d713d671cb2038e56b64ac16 Mon Sep 17 00:00:00 2001 From: Richard Braun Date: Tue, 20 Feb 2018 20:45:13 +0100 Subject: kern/rcu: new module This module implements preemptible RCU. --- kern/Makefile | 1 + kern/clock.c | 2 + kern/rcu.c | 772 ++++++++++++++++++++++++++++++++++++++++++++++++++ kern/rcu.h | 146 ++++++++++ kern/rcu_i.h | 61 ++++ kern/rcu_types.h | 40 +++ kern/thread.c | 5 + kern/thread.h | 10 + kern/thread_i.h | 3 + test/Kconfig | 3 + test/Makefile | 1 + test/test_rcu_defer.c | 231 +++++++++++++++ 12 files changed, 1275 insertions(+) create mode 100644 kern/rcu.c create mode 100644 kern/rcu.h create mode 100644 kern/rcu_i.h create mode 100644 kern/rcu_types.h create mode 100644 test/test_rcu_defer.c diff --git a/kern/Makefile b/kern/Makefile index 0aa96fc3..6c0778b4 100644 --- a/kern/Makefile +++ b/kern/Makefile @@ -20,6 +20,7 @@ x15_SOURCES-y += \ kern/plist.c \ kern/printf.c \ kern/rbtree.c \ + kern/rcu.c \ kern/rdxtree.c \ kern/rtmutex.c \ kern/semaphore.c \ diff --git a/kern/clock.c b/kern/clock.c index 19f823c3..c09ffddc 100644 --- a/kern/clock.c +++ b/kern/clock.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +92,7 @@ void clock_tick_intr(void) timer_report_periodic_event(); llsync_report_periodic_event(); + rcu_report_periodic_event(); sref_report_periodic_event(); work_report_periodic_event(); thread_report_periodic_event(); diff --git a/kern/rcu.c b/kern/rcu.c new file mode 100644 index 00000000..82c62df3 --- /dev/null +++ b/kern/rcu.c @@ -0,0 +1,772 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * + * This implementation is based on the paper "Extending RCU for Realtime + * and Embedded Workloads" by Paul E. McKenney, Ingo Molnar, Dipankar Sarma, + * and Suparna Bhattacharya. 
Beside the mechanisms not implemented yet, + * such as priority boosting, the differences are described below. + * + * First, this implementation uses scalable reference counters provided + * by the sref module instead of per-CPU counters as described in the paper. + * The main benefit of this approach is the centralization of most scalability + * improvements in the sref module, which should propagate to all sref users, + * including RCU. + * + * In addition, this implementation introduces the concept of windows, where + * a window is a range in time to which readers may be linked. Here, a + * grace period is defined as the time range at the end of a window where + * various synchronization steps are performed to enforce the RCU guarantees. + * The minimum duration of a window acts as a knob allowing users to tune + * the behavior of the RCU system. + * + * Finally, the state machine described in the paper is updated to accommodate + * for windows, since grace periods don't run back-to-back to each other. + * Windows are regularly checked and flipped if the previous one isn't + * active any more. From that moment, processors may notice the global flip + * and perform a local flip of their work window ID. Once all processors + * have acknowleged the flip, it is certain that no new work may be queued + * on the previous window. At this point, the same occurs for the + * processor-local reader window ID, and once all processors have + * acknowleged that flip, there can be no new reader linked to the previous + * window. The RCU system then releases its own reference to the previous + * window and waits for the window reference counter to drop to 0, indicating + * that all readers linked to the previous window have left their read-side + * critical section. When this global event occurs, processors are requested + * to flush the works queued for the previous window, and once they all have + * acknowleged their flush, the window ends and becomes inactive, allowing + * a new grace period to occur later on. + * + * Here is an informal diagram describing this process : + * + * t ----> + * + * reader window flip ---+ +--- no more readers + * work window flip ------+ | | +- works flushed + * (grace period start) | | | | (grace period / window end) + * v v v v + * +--------------+-+-----+-+ + * | . . . | + * | window 0 . . gp . | + * | removal . . . | reclamation + * +--------------+-+-----+-+-----+----+ + * | . | + * | window 1 . gp | + * | removal . | reclamation + * +---------------+----+-------- + * | + * | window 2 ... + * | + * +------------- + * + * TODO Improve atomic acknowledgment scalability. + * TODO Handle large amounts of deferred works. + * TODO Priority boosting of slow readers. + * TODO CPU registration for dyntick-friendly behavior. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Negative close to 0 so that an overflow occurs early. + */ +#define RCU_WINDOW_ID_INIT_VALUE ((unsigned int)-500) + +/* + * Interval between window checking. + * + * When windows are checked, a flip occurs if the previous window isn't + * active any more. + */ +#define RCU_WINDOW_CHECK_INTERVAL_MS 10 + +/* + * Grace period states. + * + * These states are only used to trigger per-CPU processing that is + * globally acknowleged by decrementing a global atomic counter. They + * do not completely represent the actual state of a grace period. 
+ */ +enum rcu_gp_state { + RCU_GP_STATE_WORK_WINDOW_FLIP, + RCU_GP_STATE_READER_WINDOW_FLIP, + RCU_GP_STATE_WORK_FLUSH, +}; + +/* + * Per-CPU view of a window. + * + * Deferred works are scheduled when the window ends. + */ +struct rcu_cpu_window { + struct work_queue works; +}; + +/* + * Per-CPU RCU data. + * + * Each processor maintains two local window IDs. One is used as the current + * window ID when deferring work, the other when detecting a reader. A local + * flip occurs when a processor notices that the global grace period state + * no longer matches the local grace period state. These checks only occur + * on periodic events. + * + * Interrupts and preemption must be disabled when accessing local CPU data. + */ +struct rcu_cpu_data { + enum rcu_gp_state gp_state; + unsigned int work_wid; + unsigned int reader_wid; + struct rcu_cpu_window windows[2]; + struct syscnt sc_nr_detected_readers; +}; + +/* + * Global window. + * + * A window is a time range that tracks read-side references. Conceptually, + * each reader adds a reference to the current window. In practice, references + * are only added when readers are detected, which occurs on a context switch + * (to track preempted threads) or a reader window flip (to prevent currently + * running readers to be linked to the next window). + * + * When a window is started, its scalable reference counter is initialized + * with a reference owned by the RCU system. That reference guarantees that + * the window remains active as long as new readers may add references, + * since it prevents the counter from dropping to 0. After a reader window + * flip, there may not be new references to the window, and the initial + * reference is dropped, allowing the counter to reach 0 once all detected + * readers leave their critical section and unreference the window they're + * linked to. + */ +struct rcu_window { + struct sref_counter nr_refs; + uint64_t start_ts; + bool active; +}; + +/* + * Global data. + * + * Processors regularly check the grace period state against their own, + * locally cached grace period state, and take action whenever they differ. + * False sharing is avoided by making the global grace period state fill an + * entire cache line on SMP. + * + * After processors notice a grace period state change, they acknowledge + * noticing this change by decrementing the atomic acknowledgment counter, + * which also fills a complete cache line on SMP in order to restrict cache + * line bouncing. Atomic operations on this counter are done with + * acquire-release ordering to enforce the memory ordering guarantees + * required by the implementation, as well as those provided by the public + * interface. + * + * In addition to the global window ID and the windows themselves, the data + * include a timer, used to trigger the end of windows, i.e. grace periods. + * Since the timer function, atomic acknowledgments, and window no-reference + * function chain each other, there is currently no need for a global lock. + */ +struct rcu_data { + struct { + alignas(CPU_L1_SIZE) enum rcu_gp_state gp_state; + }; + struct { + alignas(CPU_L1_SIZE) unsigned int nr_acks; + }; + + unsigned int wid; + struct rcu_window windows[2]; + struct timer timer; + struct syscnt sc_nr_windows; + struct syscnt sc_last_window_ms; + struct syscnt sc_longest_window_ms; +}; + +/* + * Structure used to implement rcu_wait(). 
+ */ +struct rcu_waiter { + struct work work; + struct spinlock lock; + struct thread *thread; + bool done; +}; + +static struct rcu_data rcu_data; +static struct rcu_cpu_data rcu_cpu_data __percpu; + +static struct rcu_cpu_data * +rcu_get_cpu_data(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + return cpu_local_ptr(rcu_cpu_data); +} + +static enum rcu_gp_state +rcu_data_get_gp_state(const struct rcu_data *data) +{ + return data->gp_state; +} + +static unsigned int +rcu_data_get_wid(const struct rcu_data *data) +{ + return data->wid; +} + +static struct rcu_window * +rcu_data_get_window_from_index(struct rcu_data *data, size_t index) +{ + assert(index < ARRAY_SIZE(data->windows)); + return &data->windows[index]; +} + +static struct rcu_window * +rcu_data_get_window(struct rcu_data *data, unsigned int wid) +{ + return rcu_data_get_window_from_index(data, wid & 1); +} + +static void +rcu_data_update_gp_state(struct rcu_data *data, enum rcu_gp_state gp_state) +{ + assert(data->nr_acks == 0); + + switch (gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_FLUSH); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_WINDOW_FLIP); + break; + case RCU_GP_STATE_WORK_FLUSH: + assert(data->gp_state == RCU_GP_STATE_READER_WINDOW_FLIP); + break; + default: + panic("rcu: invalid grace period state"); + } + + data->nr_acks = cpu_count(); + atomic_store(&data->gp_state, gp_state, ATOMIC_RELEASE); +} + +static bool +rcu_data_check_gp_state(const struct rcu_data *data, + enum rcu_gp_state local_gp_state, + enum rcu_gp_state *global_gp_state) +{ + *global_gp_state = atomic_load(&data->gp_state, ATOMIC_RELAXED); + + if (unlikely(local_gp_state != *global_gp_state)) { + atomic_fence_acquire(); + return true; + } + + return false; +} + +static void +rcu_window_end(struct rcu_window *window) +{ + assert(window->active); + window->active = false; +} + +static void +rcu_window_ref(struct rcu_window *window) +{ + sref_counter_inc(&window->nr_refs); +} + +static void +rcu_window_unref(struct rcu_window *window) +{ + sref_counter_dec(&window->nr_refs); +} + +static uint64_t +rcu_window_get_start_ts(const struct rcu_window *window) +{ + return window->start_ts; +} + +static void +rcu_window_flush(struct sref_counter *counter) +{ + (void)counter; + + rcu_data_update_gp_state(&rcu_data, RCU_GP_STATE_WORK_FLUSH); +} + +static void __init +rcu_window_init(struct rcu_window *window) +{ + window->active = false; +} + +static void +rcu_window_start(struct rcu_window *window) +{ + assert(!window->active); + + sref_counter_init(&window->nr_refs, 1, NULL, rcu_window_flush); + window->start_ts = clock_get_time(); + window->active = true; +} + +static bool +rcu_window_active(const struct rcu_window *window) +{ + return window->active; +} + +static void +rcu_data_end_prev_window(struct rcu_data *data, uint64_t now) +{ + struct rcu_window *window; + uint64_t duration; + + window = rcu_data_get_window(data, data->wid - 1); + duration = clock_ticks_to_ms(now - rcu_window_get_start_ts(window)); + syscnt_set(&data->sc_last_window_ms, duration); + + if (duration > syscnt_read(&data->sc_longest_window_ms)) { + syscnt_set(&data->sc_longest_window_ms, duration); + } + + rcu_window_end(window); +} + +static void +rcu_data_schedule_timer(struct rcu_data *data, uint64_t now) +{ + uint64_t ticks; + + ticks = clock_ticks_from_ms(RCU_WINDOW_CHECK_INTERVAL_MS); + timer_schedule(&data->timer, now + ticks); +} + +static void 
+rcu_data_ack_cpu(struct rcu_data *data) +{ + struct rcu_window *window; + unsigned int prev_nr_acks; + uint64_t now; + + prev_nr_acks = atomic_fetch_sub(&data->nr_acks, 1, ATOMIC_ACQ_REL); + + if (prev_nr_acks != 1) { + assert(prev_nr_acks != 0); + return; + } + + switch (data->gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_data_update_gp_state(data, RCU_GP_STATE_READER_WINDOW_FLIP); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + window = rcu_data_get_window(data, data->wid - 1); + rcu_window_unref(window); + break; + case RCU_GP_STATE_WORK_FLUSH: + now = clock_get_time(); + rcu_data_end_prev_window(data, now); + rcu_data_schedule_timer(data, now); + break; + default: + panic("rcu: invalid grace period state"); + } +} + +static bool +rcu_data_flip_windows(struct rcu_data *data) +{ + struct rcu_window *window; + + window = rcu_data_get_window(data, data->wid - 1); + + if (rcu_window_active(window)) { + return false; + } + + rcu_window_start(window); + syscnt_inc(&data->sc_nr_windows); + data->wid++; + rcu_data_update_gp_state(data, RCU_GP_STATE_WORK_WINDOW_FLIP); + return true; +} + +static void +rcu_data_check_windows(struct timer *timer) +{ + struct rcu_data *data; + bool flipped; + + data = &rcu_data; + flipped = rcu_data_flip_windows(data); + + if (!flipped) { + rcu_data_schedule_timer(data, timer_get_time(timer)); + } +} + +static void __init +rcu_data_init(struct rcu_data *data) +{ + data->gp_state = RCU_GP_STATE_WORK_FLUSH; + data->nr_acks = 0; + data->wid = RCU_WINDOW_ID_INIT_VALUE; + + for (size_t i = 0; i < ARRAY_SIZE(data->windows); i++) { + rcu_window_init(rcu_data_get_window_from_index(data, i)); + } + + rcu_window_start(rcu_data_get_window(data, data->wid)); + + timer_init(&data->timer, rcu_data_check_windows, 0); + rcu_data_schedule_timer(data, clock_get_time()); + + syscnt_register(&data->sc_nr_windows, "rcu_nr_windows"); + syscnt_register(&data->sc_last_window_ms, "rcu_last_window_ms"); + syscnt_register(&data->sc_longest_window_ms, "rcu_longest_window_ms"); +} + +static void __init +rcu_cpu_window_init(struct rcu_cpu_window *cpu_window) +{ + work_queue_init(&cpu_window->works); +} + +static void +rcu_cpu_window_queue(struct rcu_cpu_window *cpu_window, struct work *work) +{ + work_queue_push(&cpu_window->works, work); +} + +static void +rcu_cpu_window_flush(struct rcu_cpu_window *cpu_window) +{ + work_queue_schedule(&cpu_window->works, 0); + work_queue_init(&cpu_window->works); +} + +static unsigned int +rcu_cpu_data_get_reader_wid(const struct rcu_cpu_data *cpu_data) +{ + return cpu_data->reader_wid; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window_from_index(struct rcu_cpu_data *cpu_data, size_t index) +{ + assert(index < ARRAY_SIZE(cpu_data->windows)); + return &cpu_data->windows[index]; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window(struct rcu_cpu_data *cpu_data, unsigned int wid) +{ + return rcu_cpu_data_get_window_from_index(cpu_data, wid & 1); +} + +static void __init +rcu_cpu_data_init(struct rcu_cpu_data *cpu_data, unsigned int cpu) +{ + struct rcu_data *data; + char name[SYSCNT_NAME_SIZE]; + + data = &rcu_data; + + cpu_data->gp_state = rcu_data_get_gp_state(data); + cpu_data->work_wid = rcu_data_get_wid(data); + cpu_data->reader_wid = cpu_data->work_wid; + + for (size_t i = 0; i < ARRAY_SIZE(cpu_data->windows); i++) { + rcu_cpu_window_init(rcu_cpu_data_get_window_from_index(cpu_data, i)); + } + + snprintf(name, sizeof(name), "rcu_nr_detected_readers/%u", cpu); + syscnt_register(&cpu_data->sc_nr_detected_readers, name); +} 
+ +static void +rcu_cpu_data_queue(struct rcu_cpu_data *cpu_data, struct work *work) +{ + struct rcu_cpu_window *cpu_window; + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid); + rcu_cpu_window_queue(cpu_window, work); +} + +static void +rcu_cpu_data_flush(struct rcu_cpu_data *cpu_data) +{ + struct rcu_cpu_window *cpu_window; + + assert(cpu_data->work_wid == cpu_data->reader_wid); + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid - 1); + rcu_cpu_window_flush(cpu_window); +} + +void +rcu_reader_init(struct rcu_reader *reader) +{ + reader->level = 0; + reader->linked = false; +} + +static void +rcu_reader_link(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(reader == thread_rcu_reader(thread_self())); + assert(!rcu_reader_linked(reader)); + + reader->wid = rcu_cpu_data_get_reader_wid(cpu_data); + reader->linked = true; +} + +static void +rcu_reader_unlink(struct rcu_reader *reader) +{ + assert(reader->level == 0); + reader->linked = false; +} + +static void +rcu_reader_enter(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + struct rcu_window *window; + struct rcu_data *data; + unsigned int wid; + + if (rcu_reader_linked(reader)) { + return; + } + + data = &rcu_data; + wid = rcu_cpu_data_get_reader_wid(cpu_data); + window = rcu_data_get_window(data, wid); + + rcu_reader_link(reader, cpu_data); + rcu_window_ref(window); + + syscnt_inc(&cpu_data->sc_nr_detected_readers); +} + +void +rcu_reader_leave(struct rcu_reader *reader) +{ + struct rcu_window *window; + struct rcu_data *data; + + data = &rcu_data; + + window = rcu_data_get_window(data, reader->wid); + rcu_window_unref(window); + rcu_reader_unlink(reader); +} + +static void +rcu_reader_account(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + if (rcu_reader_in_cs(reader)) { + rcu_reader_enter(reader, cpu_data); + } +} + +static void +rcu_cpu_data_flip_work_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + cpu_data->work_wid++; +} + +static void +rcu_cpu_data_flip_reader_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_reader_account(thread_rcu_reader(thread_self()), cpu_data); + cpu_data->reader_wid++; +} + +static void +rcu_cpu_data_check_gp_state(struct rcu_cpu_data *cpu_data) +{ + enum rcu_gp_state local_gp_state, global_gp_state; + struct rcu_data *data; + bool diff; + + data = &rcu_data; + + for (;;) { + local_gp_state = cpu_data->gp_state; + diff = rcu_data_check_gp_state(data, local_gp_state, &global_gp_state); + + if (!diff) { + break; + } + + switch (global_gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_cpu_data_flip_work_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + rcu_cpu_data_flip_reader_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case RCU_GP_STATE_WORK_FLUSH: + rcu_cpu_data_flush(cpu_data); + rcu_data_ack_cpu(data); + break; + default: + panic("rcu: invalid grace period state"); + } + + cpu_data->gp_state = global_gp_state; + } +} + +void +rcu_report_context_switch(struct rcu_reader *reader) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + /* + * Most readers don't need to be accounted for because their execution + * doesn't overlap with a grace period. If a reader is preempted however, + * it must be accounted in case a grace period starts while the reader + * is preempted. 
Accounting also occurs when a grace period starts, and + * more exactly, when the reader window ID of a processor is flipped. + */ + rcu_reader_account(reader, rcu_get_cpu_data()); +} + +void +rcu_report_periodic_event(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_cpu_data_check_gp_state(rcu_get_cpu_data()); +} + +void +rcu_defer(struct work *work) +{ + struct rcu_cpu_data *cpu_data; + unsigned long flags; + + thread_preempt_disable_intr_save(&flags); + cpu_data = rcu_get_cpu_data(); + rcu_cpu_data_queue(cpu_data, work); + thread_preempt_enable_intr_restore(flags); +} + +static void +rcu_waiter_wakeup(struct work *work) +{ + struct rcu_waiter *waiter; + + waiter = structof(work, struct rcu_waiter, work); + + spinlock_lock(&waiter->lock); + waiter->done = true; + thread_wakeup(waiter->thread); + spinlock_unlock(&waiter->lock); +} + +static void +rcu_waiter_init(struct rcu_waiter *waiter, struct thread *thread) +{ + work_init(&waiter->work, rcu_waiter_wakeup); + spinlock_init(&waiter->lock); + waiter->thread = thread; + waiter->done = false; +} + +static void +rcu_waiter_wait(struct rcu_waiter *waiter) +{ + rcu_defer(&waiter->work); + + spinlock_lock(&waiter->lock); + + while (!waiter->done) { + thread_sleep(&waiter->lock, waiter, "rcu_wait"); + } + + spinlock_unlock(&waiter->lock); +} + +void +rcu_wait(void) +{ + struct rcu_waiter waiter; + + rcu_waiter_init(&waiter, thread_self()), + rcu_waiter_wait(&waiter); +} + +static int __init +rcu_setup(void) +{ + rcu_data_init(&rcu_data); + + for (size_t i = 0; i < cpu_count(); i++) { + rcu_cpu_data_init(percpu_ptr(rcu_cpu_data, i), i); + } + + return 0; +} + +INIT_OP_DEFINE(rcu_setup, + INIT_OP_DEP(clock_setup, true), + INIT_OP_DEP(cpu_mp_probe, true), + INIT_OP_DEP(spinlock_setup, true), + INIT_OP_DEP(sref_setup, true), + INIT_OP_DEP(syscnt_setup, true), + INIT_OP_DEP(thread_bootstrap, true), + INIT_OP_DEP(timer_setup, true), + INIT_OP_DEP(work_setup, true)); diff --git a/kern/rcu.h b/kern/rcu.h new file mode 100644 index 00000000..1ddae7d7 --- /dev/null +++ b/kern/rcu.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * + * Read-Copy Update. + * + * This module provides a synchronization framework for read-only lockless + * operations, best used on read-mostly data structures. + * + * To learn more about RCU, see "Is Parallel Programming Hard, And, If So, + * What Can You Do About It ?" by Paul E. McKenney. + * + * A thorough discussion of RCU's requirements can be found at [1]. + * However, the memory model and terminology used in that document being + * different from C11, the memory barrier guarantees are hereby translated + * into memory ordering guarantees : + * + * 1. 
Release-acquire ordering is enforced between the end of all + * read-side critical sections starting before a grace period starts + * and the start of all works deferred before the same grace period + * starts. + * 2. Release-acquire ordering is enforced between the start of all + * read-side critical sections ending after a grace period ends + * and all the work deferrals done before the same grace period + * starts. + * 3. Release-acquire ordering is enforced between all the work deferrals + * done before a grace period starts and the start of all works deferred + * before the same grace period starts. + * + * [1] https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.html. + */ + +#ifndef _KERN_RCU_H +#define _KERN_RCU_H + +#include +#include +#include +#include +#include +#include + +/* + * Thread-local RCU data. + */ +struct rcu_reader; + +/* + * Safely store a pointer. + * + * This macro enforces release ordering on the given pointer. + */ +#define rcu_store_ptr(ptr, value) atomic_store(&(ptr), value, ATOMIC_RELEASE) + +/* + * Safely load a pointer. + * + * This macro enforces consume ordering on the given pointer. + */ +#define rcu_load_ptr(ptr) atomic_load(&(ptr), ATOMIC_CONSUME) + +/* + * Read-side critical section functions. + * + * Critical sections are preemptible, may safely nest, and may safely + * be used in interrupt context. It is not allowed to sleep inside a + * read-side critical section. However, it is allowed to acquire locks + * that don't sleep, such as spin locks. + * + * Entering or leaving a critical section doesn't provide any memory + * ordering guarantee. + */ + +static inline void +rcu_read_enter(void) +{ + rcu_reader_inc(thread_rcu_reader(thread_self())); + barrier(); +} + +static inline void +rcu_read_leave(void) +{ + barrier(); + rcu_reader_dec(thread_rcu_reader(thread_self())); +} + +/* + * Initialize a RCU reader. + */ +void rcu_reader_init(struct rcu_reader *reader); + +/* + * Report a context switch on the current processor. + * + * The argument is the RCU reader of the preempted thread. + * + * Interrupts and preemption must be disabled when calling this function. + */ +void rcu_report_context_switch(struct rcu_reader *reader); + +/* + * Report a periodic event on the current processor. + * + * Interrupts and preemption must be disabled when calling this function. + */ +void rcu_report_periodic_event(void); + +/* + * Defer a work until all existing read-side references are dropped, + * without blocking. + * + * The given work is scheduled when all existing read-side references + * are dropped. + */ +void rcu_defer(struct work *work); + +/* + * Wait for all existing read-side references to be dropped. + * + * This function sleeps, and may do so for a moderately long duration, + * at leat a few system timer ticks, sometimes a lot more. + */ +void rcu_wait(void); + +/* + * This init operation provides : + * - read-side critical sections may be used + * - module fully initialized + */ +INIT_OP_DECLARE(rcu_setup); + +#endif /* _KERN_RCU_H */ diff --git a/kern/rcu_i.h b/kern/rcu_i.h new file mode 100644 index 00000000..6be933db --- /dev/null +++ b/kern/rcu_i.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef _KERN_RCU_I_H +#define _KERN_RCU_I_H + +#include +#include + +#include +#include +#include + +void rcu_reader_leave(struct rcu_reader *reader); + +static inline bool +rcu_reader_in_cs(const struct rcu_reader *reader) +{ + return reader->level != 0; +} + +static inline bool +rcu_reader_linked(const struct rcu_reader *reader) +{ + assert(reader == thread_rcu_reader(thread_self())); + return reader->linked; +} + +static inline void +rcu_reader_inc(struct rcu_reader *reader) +{ + reader->level++; + assert(reader->level != 0); +} + +static inline void +rcu_reader_dec(struct rcu_reader *reader) +{ + assert(reader->level != 0); + reader->level--; + + if (unlikely(!rcu_reader_in_cs(reader) && rcu_reader_linked(reader))) { + rcu_reader_leave(reader); + } +} + +#endif /* _KERN_RCU_I_H */ diff --git a/kern/rcu_types.h b/kern/rcu_types.h new file mode 100644 index 00000000..12b8913f --- /dev/null +++ b/kern/rcu_types.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * + * Isolated type definition used to avoid inclusion circular dependencies. + */ + +#ifndef _KERN_RCU_TYPES_H +#define _KERN_RCU_TYPES_H + +#include + +/* + * Thread-local data used to track threads running read-side critical + * sections. + * + * The window ID is valid if and only if the reader is linked. + * + * Interrupts and preemption must be disabled when accessing a reader. + */ +struct rcu_reader { + unsigned int level; + unsigned int wid; + bool linked; +}; + +#endif /* _KERN_RCU_TYPES_H */ diff --git a/kern/thread.c b/kern/thread.c index 63af1bde..8460f3bc 100644 --- a/kern/thread.c +++ b/kern/thread.c @@ -101,6 +101,7 @@ #include #include #include +#include #include #include #include @@ -641,6 +642,8 @@ thread_runq_schedule(struct thread_runq *runq) assert(next->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL); if (likely(prev != next)) { + rcu_report_context_switch(thread_rcu_reader(prev)); + /* * That's where the true context switch occurs. The next thread must * unlock the run queue and reenable preemption. 
Note that unlocking @@ -1700,6 +1703,7 @@ thread_init_booter(unsigned int cpu) booter->flags = 0; booter->intr_level = 0; booter->preempt_level = 1; + rcu_reader_init(&booter->rcu_reader); cpumap_fill(&booter->cpumap); thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE); thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE); @@ -1828,6 +1832,7 @@ thread_init(struct thread *thread, void *stack, thread->pin_level = 0; thread->intr_level = 0; thread->llsync_level = 0; + rcu_reader_init(&thread->rcu_reader); cpumap_copy(&thread->cpumap, cpumap); thread_set_user_sched_policy(thread, attr->policy); thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy)); diff --git a/kern/thread.h b/kern/thread.h index 58322bda..c4dbd5d1 100644 --- a/kern/thread.h +++ b/kern/thread.h @@ -678,6 +678,16 @@ thread_llsync_read_dec(void) thread->llsync_level--; } +/* + * RCU functions. + */ + +static inline struct rcu_reader * +thread_rcu_reader(struct thread *thread) +{ + return &thread->rcu_reader; +} + /* * Type for thread-specific data destructor. */ diff --git a/kern/thread_i.h b/kern/thread_i.h index 96ce100a..979d4b0f 100644 --- a/kern/thread_i.h +++ b/kern/thread_i.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -137,6 +138,8 @@ struct thread { /* Read-side critical section level, not in any if 0 */ unsigned short llsync_level; /* (-) */ + struct rcu_reader rcu_reader; /* (-) */ + /* Processors on which this thread is allowed to run */ struct cpumap cpumap; /* (r) */ diff --git a/test/Kconfig b/test/Kconfig index 8f3ae8ba..6a6221bb 100644 --- a/test/Kconfig +++ b/test/Kconfig @@ -22,6 +22,9 @@ config TEST_MODULE_MUTEX_PI config TEST_MODULE_PMAP_UPDATE_MP bool "pmap_update_mp" +config TEST_MODULE_RCU_DEFER + bool "rcu_defer" + config TEST_MODULE_SREF_DIRTY_ZEROES bool "sref_dirty_zeroes" diff --git a/test/Makefile b/test/Makefile index 098cecbd..b7ac3d7f 100644 --- a/test/Makefile +++ b/test/Makefile @@ -2,6 +2,7 @@ x15_SOURCES-$(CONFIG_TEST_MODULE_LLSYNC_DEFER) += test/test_llsync_defe x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX) += test/test_mutex.c x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX_PI) += test/test_mutex_pi.c x15_SOURCES-$(CONFIG_TEST_MODULE_PMAP_UPDATE_MP) += test/test_pmap_update_mp.c +x15_SOURCES-$(CONFIG_TEST_MODULE_RCU_DEFER) += test/test_rcu_defer.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_DIRTY_ZEROES) += test/test_sref_dirty_zeroes.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_NOREF) += test/test_sref_noref.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_WEAKREF) += test/test_sref_weakref.c diff --git a/test/test_rcu_defer.c b/test/test_rcu_defer.c new file mode 100644 index 00000000..e1ab0e51 --- /dev/null +++ b/test/test_rcu_defer.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2014-2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * + * This test module is a stress test, expected to never terminate, of the + * work deferring functionality of the rcu module. It creates three + * threads, a producer, a consumer, and a reader. The producer allocates + * a page and writes it. It then transfers the page to the consumer, using + * the rcu interface to update the global page pointer. Once at the + * consumer, the rcu interface is used to defer the release of the page. + * Concurrently, the reader accesses the page and checks its content when + * available. These accesses are performed inside a read-side critical + * section and should therefore never fail. + * + * Each thread regularly prints a string to report that it's making progress. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TEST_LOOPS_PER_PRINT 100000 + +struct test_pdsc { + struct work work; + void *addr; +}; + +#define TEST_VALIDATION_BYTE 0xab + +static struct mutex test_lock; +static struct condition test_condition; +static struct test_pdsc *test_pdsc; + +static struct kmem_cache test_pdsc_cache; + +static void +test_alloc(void *arg) +{ + struct test_pdsc *pdsc; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + mutex_lock(&test_lock); + + for (;;) { + while (test_pdsc != NULL) { + condition_wait(&test_condition, &test_lock); + } + + pdsc = kmem_cache_alloc(&test_pdsc_cache); + + if (pdsc != NULL) { + pdsc->addr = vm_kmem_alloc(PAGE_SIZE); + + if (pdsc->addr != NULL) { + memset(pdsc->addr, TEST_VALIDATION_BYTE, PAGE_SIZE); + } + } + + rcu_store_ptr(test_pdsc, pdsc); + condition_signal(&test_condition); + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("alloc "); + } + + nr_loops++; + } +} + +static void +test_deferred_free(struct work *work) +{ + struct test_pdsc *pdsc; + + pdsc = structof(work, struct test_pdsc, work); + + if (pdsc->addr != NULL) { + vm_kmem_free(pdsc->addr, PAGE_SIZE); + } + + kmem_cache_free(&test_pdsc_cache, pdsc); +} + +static void +test_free(void *arg) +{ + struct test_pdsc *pdsc; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + mutex_lock(&test_lock); + + for (;;) { + while (test_pdsc == NULL) { + condition_wait(&test_condition, &test_lock); + } + + pdsc = test_pdsc; + rcu_store_ptr(test_pdsc, NULL); + + if (pdsc != NULL) { + work_init(&pdsc->work, test_deferred_free); + rcu_defer(&pdsc->work); + } + + condition_signal(&test_condition); + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("free "); + } + + nr_loops++; + } +} + +static void +test_read(void *arg) +{ + const struct test_pdsc *pdsc; + const unsigned char *s; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + for (;;) { + rcu_read_enter(); + + pdsc = rcu_load_ptr(test_pdsc); + + if (pdsc != NULL) { + s = (const unsigned char *)pdsc->addr; + + if (s != NULL) { + for (unsigned int i = 0; i < PAGE_SIZE; i++) { + if (s[i] != TEST_VALIDATION_BYTE) { + panic("invalid content"); + } + } + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("read "); + } + + nr_loops++; + } + } + + rcu_read_leave(); + } +} + +void +test_setup(void) +{ + struct thread_attr attr; + struct thread *thread; + struct cpumap *cpumap; + int error; + + condition_init(&test_condition); + mutex_init(&test_lock); + + error = cpumap_create(&cpumap); + error_check(error, "cpumap_create"); + + kmem_cache_init(&test_pdsc_cache, "test_pdsc", + sizeof(struct test_pdsc), 0, NULL, 0); + + thread_attr_init(&attr, 
THREAD_KERNEL_PREFIX "test_alloc"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, cpu_count() - 1); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_alloc, NULL); + error_check(error, "thread_create"); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_free"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, cpu_count() - 1); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_free, NULL); + error_check(error, "thread_create"); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_read"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, 0); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_read, NULL); + error_check(error, "thread_create"); + + cpumap_destroy(cpumap); +} -- cgit v1.2.3
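
As a usage reference for the interface added by this series, here is a short sketch modeled on kern/rcu.h and test/test_rcu_defer.c. It is illustrative only: the obj type, obj_cache, obj_current and the obj_* functions are assumptions made for the example, not part of the patches; only the rcu_*, work_*, kmem_*, structof and ERROR_* names come from the tree, and updaters are assumed to be serialized externally (e.g. by a lock).

/*
 * Illustrative sketch of the read/update pattern enabled by this series.
 * The obj type, obj_cache (assumed initialized elsewhere with
 * kmem_cache_init()) and obj_current are hypothetical; header paths
 * follow the kern/ layout used by the rest of the tree.
 */

#include <kern/error.h>
#include <kern/kmem.h>
#include <kern/macros.h>
#include <kern/rcu.h>
#include <kern/work.h>

struct obj {
    struct work work;
    int value;
};

static struct kmem_cache obj_cache;
static struct obj *obj_current;

/* Read side: preemptible, may nest, must not sleep. */
static int
obj_read_value(void)
{
    const struct obj *obj;
    int value;

    rcu_read_enter();
    obj = rcu_load_ptr(obj_current);        /* consume ordering */
    value = (obj == NULL) ? -1 : obj->value;
    rcu_read_leave();

    return value;
}

static void
obj_work_free(struct work *work)
{
    kmem_cache_free(&obj_cache, structof(work, struct obj, work));
}

/* Update side: publish a new object, defer release of the old one. */
static int
obj_update(int value)
{
    struct obj *obj, *prev;

    obj = kmem_cache_alloc(&obj_cache);

    if (obj == NULL) {
        return ERROR_NOMEM;
    }

    obj->value = value;
    prev = obj_current;                     /* updaters serialized by caller */
    rcu_store_ptr(obj_current, obj);        /* release ordering */

    if (prev != NULL) {
        work_init(&prev->work, obj_work_free);
        rcu_defer(&prev->work);             /* or rcu_wait() then free directly */
    }

    return 0;
}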