diff options
author | Richard Braun <rbraun@sceen.net> | 2018-02-20 20:46:37 +0100 |
---|---|---|
committer | Richard Braun <rbraun@sceen.net> | 2018-02-20 20:46:37 +0100 |
commit | da8eb9c244d27fd042adde6234ccec079681d7f4 (patch) | |
tree | f0de4b67e3a59dded6598efe3db87c835da72b92 | |
parent | ed9ada07d9c841b3f3f3a3f841aae2792a30779a (diff) | |
parent | 22dff6b7a6839e77d713d671cb2038e56b64ac16 (diff) |
Merge branch 'preemptible_rcu'
-rw-r--r-- | kern/Makefile | 1 | ||||
-rw-r--r-- | kern/clock.c | 2 | ||||
-rw-r--r-- | kern/rcu.c | 772 | ||||
-rw-r--r-- | kern/rcu.h | 146 | ||||
-rw-r--r-- | kern/rcu_i.h | 61 | ||||
-rw-r--r-- | kern/rcu_types.h | 40 | ||||
-rw-r--r-- | kern/thread.c | 16 | ||||
-rw-r--r-- | kern/thread.h | 10 | ||||
-rw-r--r-- | kern/thread_i.h | 3 | ||||
-rw-r--r-- | test/Kconfig | 3 | ||||
-rw-r--r-- | test/Makefile | 1 | ||||
-rw-r--r-- | test/test_rcu_defer.c | 231 |
12 files changed, 1275 insertions, 11 deletions
diff --git a/kern/Makefile b/kern/Makefile index 0aa96fc..6c0778b 100644 --- a/kern/Makefile +++ b/kern/Makefile @@ -20,6 +20,7 @@ x15_SOURCES-y += \ kern/plist.c \ kern/printf.c \ kern/rbtree.c \ + kern/rcu.c \ kern/rdxtree.c \ kern/rtmutex.c \ kern/semaphore.c \ diff --git a/kern/clock.c b/kern/clock.c index 19f823c..c09ffdd 100644 --- a/kern/clock.c +++ b/kern/clock.c @@ -24,6 +24,7 @@ #include <kern/init.h> #include <kern/llsync.h> #include <kern/percpu.h> +#include <kern/rcu.h> #include <kern/sref.h> #include <kern/syscnt.h> #include <kern/thread.h> @@ -91,6 +92,7 @@ void clock_tick_intr(void) timer_report_periodic_event(); llsync_report_periodic_event(); + rcu_report_periodic_event(); sref_report_periodic_event(); work_report_periodic_event(); thread_report_periodic_event(); diff --git a/kern/rcu.c b/kern/rcu.c new file mode 100644 index 0000000..82c62df --- /dev/null +++ b/kern/rcu.c @@ -0,0 +1,772 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This implementation is based on the paper "Extending RCU for Realtime + * and Embedded Workloads" by Paul E. McKenney, Ingo Molnar, Dipankar Sarma, + * and Suparna Bhattacharya. Beside the mechanisms not implemented yet, + * such as priority boosting, the differences are described below. 
+ * + * First, this implementation uses scalable reference counters provided + * by the sref module instead of per-CPU counters as described in the paper. + * The main benefit of this approach is the centralization of most scalability + * improvements in the sref module, which should propagate to all sref users, + * including RCU. + * + * In addition, this implementation introduces the concept of windows, where + * a window is a range in time to which readers may be linked. Here, a + * grace period is defined as the time range at the end of a window where + * various synchronization steps are performed to enforce the RCU guarantees. + * The minimum duration of a window acts as a knob allowing users to tune + * the behavior of the RCU system. + * + * Finally, the state machine described in the paper is updated to accommodate + * windows, since grace periods don't run back-to-back to each other. + * Windows are regularly checked and flipped if the previous one isn't + * active any more. From that moment, processors may notice the global flip + * and perform a local flip of their work window ID. Once all processors + * have acknowledged the flip, it is certain that no new work may be queued + * on the previous window. At this point, the same occurs for the + * processor-local reader window ID, and once all processors have + * acknowledged that flip, there can be no new reader linked to the previous + * window. The RCU system then releases its own reference to the previous + * window and waits for the window reference counter to drop to 0, indicating + * that all readers linked to the previous window have left their read-side + * critical section. When this global event occurs, processors are requested + * to flush the works queued for the previous window, and once they all have + * acknowledged their flush, the window ends and becomes inactive, allowing + * a new grace period to occur later on. 
+ * + * Here is an informal diagram describing this process : + * + * t ----> + * + * reader window flip ---+ +--- no more readers + * work window flip ------+ | | +- works flushed + * (grace period start) | | | | (grace period / window end) + * v v v v + * +--------------+-+-----+-+ + * | . . . | + * | window 0 . . gp . | + * | removal . . . | reclamation + * +--------------+-+-----+-+-----+----+ + * | . | + * | window 1 . gp | + * | removal . | reclamation + * +---------------+----+-------- + * | + * | window 2 ... + * | + * +------------- + * + * TODO Improve atomic acknowledgment scalability. + * TODO Handle large amounts of deferred works. + * TODO Priority boosting of slow readers. + * TODO CPU registration for dyntick-friendly behavior. + */ + +#include <assert.h> +#include <stdalign.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> + +#include <kern/atomic.h> +#include <kern/clock.h> +#include <kern/init.h> +#include <kern/macros.h> +#include <kern/rcu.h> +#include <kern/panic.h> +#include <kern/percpu.h> +#include <kern/spinlock.h> +#include <kern/sref.h> +#include <kern/syscnt.h> +#include <kern/thread.h> +#include <kern/timer.h> +#include <kern/work.h> +#include <machine/cpu.h> + +/* + * Negative close to 0 so that an overflow occurs early. + */ +#define RCU_WINDOW_ID_INIT_VALUE ((unsigned int)-500) + +/* + * Interval between window checking. + * + * When windows are checked, a flip occurs if the previous window isn't + * active any more. + */ +#define RCU_WINDOW_CHECK_INTERVAL_MS 10 + +/* + * Grace period states. + * + * These states are only used to trigger per-CPU processing that is + * globally acknowleged by decrementing a global atomic counter. They + * do not completely represent the actual state of a grace period. + */ +enum rcu_gp_state { + RCU_GP_STATE_WORK_WINDOW_FLIP, + RCU_GP_STATE_READER_WINDOW_FLIP, + RCU_GP_STATE_WORK_FLUSH, +}; + +/* + * Per-CPU view of a window. + * + * Deferred works are scheduled when the window ends. 
+ */ +struct rcu_cpu_window { + struct work_queue works; +}; + +/* + * Per-CPU RCU data. + * + * Each processor maintains two local window IDs. One is used as the current + * window ID when deferring work, the other when detecting a reader. A local + * flip occurs when a processor notices that the global grace period state + * no longer matches the local grace period state. These checks only occur + * on periodic events. + * + * Interrupts and preemption must be disabled when accessing local CPU data. + */ +struct rcu_cpu_data { + enum rcu_gp_state gp_state; + unsigned int work_wid; + unsigned int reader_wid; + struct rcu_cpu_window windows[2]; + struct syscnt sc_nr_detected_readers; +}; + +/* + * Global window. + * + * A window is a time range that tracks read-side references. Conceptually, + * each reader adds a reference to the current window. In practice, references + * are only added when readers are detected, which occurs on a context switch + * (to track preempted threads) or a reader window flip (to prevent currently + * running readers to be linked to the next window). + * + * When a window is started, its scalable reference counter is initialized + * with a reference owned by the RCU system. That reference guarantees that + * the window remains active as long as new readers may add references, + * since it prevents the counter from dropping to 0. After a reader window + * flip, there may not be new references to the window, and the initial + * reference is dropped, allowing the counter to reach 0 once all detected + * readers leave their critical section and unreference the window they're + * linked to. + */ +struct rcu_window { + struct sref_counter nr_refs; + uint64_t start_ts; + bool active; +}; + +/* + * Global data. + * + * Processors regularly check the grace period state against their own, + * locally cached grace period state, and take action whenever they differ. 
+ * False sharing is avoided by making the global grace period state fill an + * entire cache line on SMP. + * + * After processors notice a grace period state change, they acknowledge + * noticing this change by decrementing the atomic acknowledgment counter, + * which also fills a complete cache line on SMP in order to restrict cache + * line bouncing. Atomic operations on this counter are done with + * acquire-release ordering to enforce the memory ordering guarantees + * required by the implementation, as well as those provided by the public + * interface. + * + * In addition to the global window ID and the windows themselves, the data + * include a timer, used to trigger the end of windows, i.e. grace periods. + * Since the timer function, atomic acknowledgments, and window no-reference + * function chain each other, there is currently no need for a global lock. + */ +struct rcu_data { + struct { + alignas(CPU_L1_SIZE) enum rcu_gp_state gp_state; + }; + struct { + alignas(CPU_L1_SIZE) unsigned int nr_acks; + }; + + unsigned int wid; + struct rcu_window windows[2]; + struct timer timer; + struct syscnt sc_nr_windows; + struct syscnt sc_last_window_ms; + struct syscnt sc_longest_window_ms; +}; + +/* + * Structure used to implement rcu_wait(). 
+ */ +struct rcu_waiter { + struct work work; + struct spinlock lock; + struct thread *thread; + bool done; +}; + +static struct rcu_data rcu_data; +static struct rcu_cpu_data rcu_cpu_data __percpu; + +static struct rcu_cpu_data * +rcu_get_cpu_data(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + return cpu_local_ptr(rcu_cpu_data); +} + +static enum rcu_gp_state +rcu_data_get_gp_state(const struct rcu_data *data) +{ + return data->gp_state; +} + +static unsigned int +rcu_data_get_wid(const struct rcu_data *data) +{ + return data->wid; +} + +static struct rcu_window * +rcu_data_get_window_from_index(struct rcu_data *data, size_t index) +{ + assert(index < ARRAY_SIZE(data->windows)); + return &data->windows[index]; +} + +static struct rcu_window * +rcu_data_get_window(struct rcu_data *data, unsigned int wid) +{ + return rcu_data_get_window_from_index(data, wid & 1); +} + +static void +rcu_data_update_gp_state(struct rcu_data *data, enum rcu_gp_state gp_state) +{ + assert(data->nr_acks == 0); + + switch (gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_FLUSH); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_WINDOW_FLIP); + break; + case RCU_GP_STATE_WORK_FLUSH: + assert(data->gp_state == RCU_GP_STATE_READER_WINDOW_FLIP); + break; + default: + panic("rcu: invalid grace period state"); + } + + data->nr_acks = cpu_count(); + atomic_store(&data->gp_state, gp_state, ATOMIC_RELEASE); +} + +static bool +rcu_data_check_gp_state(const struct rcu_data *data, + enum rcu_gp_state local_gp_state, + enum rcu_gp_state *global_gp_state) +{ + *global_gp_state = atomic_load(&data->gp_state, ATOMIC_RELAXED); + + if (unlikely(local_gp_state != *global_gp_state)) { + atomic_fence_acquire(); + return true; + } + + return false; +} + +static void +rcu_window_end(struct rcu_window *window) +{ + assert(window->active); + window->active = false; +} + +static void 
+rcu_window_ref(struct rcu_window *window) +{ + sref_counter_inc(&window->nr_refs); +} + +static void +rcu_window_unref(struct rcu_window *window) +{ + sref_counter_dec(&window->nr_refs); +} + +static uint64_t +rcu_window_get_start_ts(const struct rcu_window *window) +{ + return window->start_ts; +} + +static void +rcu_window_flush(struct sref_counter *counter) +{ + (void)counter; + + rcu_data_update_gp_state(&rcu_data, RCU_GP_STATE_WORK_FLUSH); +} + +static void __init +rcu_window_init(struct rcu_window *window) +{ + window->active = false; +} + +static void +rcu_window_start(struct rcu_window *window) +{ + assert(!window->active); + + sref_counter_init(&window->nr_refs, 1, NULL, rcu_window_flush); + window->start_ts = clock_get_time(); + window->active = true; +} + +static bool +rcu_window_active(const struct rcu_window *window) +{ + return window->active; +} + +static void +rcu_data_end_prev_window(struct rcu_data *data, uint64_t now) +{ + struct rcu_window *window; + uint64_t duration; + + window = rcu_data_get_window(data, data->wid - 1); + duration = clock_ticks_to_ms(now - rcu_window_get_start_ts(window)); + syscnt_set(&data->sc_last_window_ms, duration); + + if (duration > syscnt_read(&data->sc_longest_window_ms)) { + syscnt_set(&data->sc_longest_window_ms, duration); + } + + rcu_window_end(window); +} + +static void +rcu_data_schedule_timer(struct rcu_data *data, uint64_t now) +{ + uint64_t ticks; + + ticks = clock_ticks_from_ms(RCU_WINDOW_CHECK_INTERVAL_MS); + timer_schedule(&data->timer, now + ticks); +} + +static void +rcu_data_ack_cpu(struct rcu_data *data) +{ + struct rcu_window *window; + unsigned int prev_nr_acks; + uint64_t now; + + prev_nr_acks = atomic_fetch_sub(&data->nr_acks, 1, ATOMIC_ACQ_REL); + + if (prev_nr_acks != 1) { + assert(prev_nr_acks != 0); + return; + } + + switch (data->gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_data_update_gp_state(data, RCU_GP_STATE_READER_WINDOW_FLIP); + break; + case 
RCU_GP_STATE_READER_WINDOW_FLIP: + window = rcu_data_get_window(data, data->wid - 1); + rcu_window_unref(window); + break; + case RCU_GP_STATE_WORK_FLUSH: + now = clock_get_time(); + rcu_data_end_prev_window(data, now); + rcu_data_schedule_timer(data, now); + break; + default: + panic("rcu: invalid grace period state"); + } +} + +static bool +rcu_data_flip_windows(struct rcu_data *data) +{ + struct rcu_window *window; + + window = rcu_data_get_window(data, data->wid - 1); + + if (rcu_window_active(window)) { + return false; + } + + rcu_window_start(window); + syscnt_inc(&data->sc_nr_windows); + data->wid++; + rcu_data_update_gp_state(data, RCU_GP_STATE_WORK_WINDOW_FLIP); + return true; +} + +static void +rcu_data_check_windows(struct timer *timer) +{ + struct rcu_data *data; + bool flipped; + + data = &rcu_data; + flipped = rcu_data_flip_windows(data); + + if (!flipped) { + rcu_data_schedule_timer(data, timer_get_time(timer)); + } +} + +static void __init +rcu_data_init(struct rcu_data *data) +{ + data->gp_state = RCU_GP_STATE_WORK_FLUSH; + data->nr_acks = 0; + data->wid = RCU_WINDOW_ID_INIT_VALUE; + + for (size_t i = 0; i < ARRAY_SIZE(data->windows); i++) { + rcu_window_init(rcu_data_get_window_from_index(data, i)); + } + + rcu_window_start(rcu_data_get_window(data, data->wid)); + + timer_init(&data->timer, rcu_data_check_windows, 0); + rcu_data_schedule_timer(data, clock_get_time()); + + syscnt_register(&data->sc_nr_windows, "rcu_nr_windows"); + syscnt_register(&data->sc_last_window_ms, "rcu_last_window_ms"); + syscnt_register(&data->sc_longest_window_ms, "rcu_longest_window_ms"); +} + +static void __init +rcu_cpu_window_init(struct rcu_cpu_window *cpu_window) +{ + work_queue_init(&cpu_window->works); +} + +static void +rcu_cpu_window_queue(struct rcu_cpu_window *cpu_window, struct work *work) +{ + work_queue_push(&cpu_window->works, work); +} + +static void +rcu_cpu_window_flush(struct rcu_cpu_window *cpu_window) +{ + work_queue_schedule(&cpu_window->works, 0); 
+ work_queue_init(&cpu_window->works); +} + +static unsigned int +rcu_cpu_data_get_reader_wid(const struct rcu_cpu_data *cpu_data) +{ + return cpu_data->reader_wid; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window_from_index(struct rcu_cpu_data *cpu_data, size_t index) +{ + assert(index < ARRAY_SIZE(cpu_data->windows)); + return &cpu_data->windows[index]; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window(struct rcu_cpu_data *cpu_data, unsigned int wid) +{ + return rcu_cpu_data_get_window_from_index(cpu_data, wid & 1); +} + +static void __init +rcu_cpu_data_init(struct rcu_cpu_data *cpu_data, unsigned int cpu) +{ + struct rcu_data *data; + char name[SYSCNT_NAME_SIZE]; + + data = &rcu_data; + + cpu_data->gp_state = rcu_data_get_gp_state(data); + cpu_data->work_wid = rcu_data_get_wid(data); + cpu_data->reader_wid = cpu_data->work_wid; + + for (size_t i = 0; i < ARRAY_SIZE(cpu_data->windows); i++) { + rcu_cpu_window_init(rcu_cpu_data_get_window_from_index(cpu_data, i)); + } + + snprintf(name, sizeof(name), "rcu_nr_detected_readers/%u", cpu); + syscnt_register(&cpu_data->sc_nr_detected_readers, name); +} + +static void +rcu_cpu_data_queue(struct rcu_cpu_data *cpu_data, struct work *work) +{ + struct rcu_cpu_window *cpu_window; + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid); + rcu_cpu_window_queue(cpu_window, work); +} + +static void +rcu_cpu_data_flush(struct rcu_cpu_data *cpu_data) +{ + struct rcu_cpu_window *cpu_window; + + assert(cpu_data->work_wid == cpu_data->reader_wid); + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid - 1); + rcu_cpu_window_flush(cpu_window); +} + +void +rcu_reader_init(struct rcu_reader *reader) +{ + reader->level = 0; + reader->linked = false; +} + +static void +rcu_reader_link(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(reader == thread_rcu_reader(thread_self())); + assert(!rcu_reader_linked(reader)); + + reader->wid = 
rcu_cpu_data_get_reader_wid(cpu_data); + reader->linked = true; +} + +static void +rcu_reader_unlink(struct rcu_reader *reader) +{ + assert(reader->level == 0); + reader->linked = false; +} + +static void +rcu_reader_enter(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + struct rcu_window *window; + struct rcu_data *data; + unsigned int wid; + + if (rcu_reader_linked(reader)) { + return; + } + + data = &rcu_data; + wid = rcu_cpu_data_get_reader_wid(cpu_data); + window = rcu_data_get_window(data, wid); + + rcu_reader_link(reader, cpu_data); + rcu_window_ref(window); + + syscnt_inc(&cpu_data->sc_nr_detected_readers); +} + +void +rcu_reader_leave(struct rcu_reader *reader) +{ + struct rcu_window *window; + struct rcu_data *data; + + data = &rcu_data; + + window = rcu_data_get_window(data, reader->wid); + rcu_window_unref(window); + rcu_reader_unlink(reader); +} + +static void +rcu_reader_account(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + if (rcu_reader_in_cs(reader)) { + rcu_reader_enter(reader, cpu_data); + } +} + +static void +rcu_cpu_data_flip_work_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + cpu_data->work_wid++; +} + +static void +rcu_cpu_data_flip_reader_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_reader_account(thread_rcu_reader(thread_self()), cpu_data); + cpu_data->reader_wid++; +} + +static void +rcu_cpu_data_check_gp_state(struct rcu_cpu_data *cpu_data) +{ + enum rcu_gp_state local_gp_state, global_gp_state; + struct rcu_data *data; + bool diff; + + data = &rcu_data; + + for (;;) { + local_gp_state = cpu_data->gp_state; + diff = rcu_data_check_gp_state(data, local_gp_state, &global_gp_state); + + if (!diff) { + break; + } + + switch (global_gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_cpu_data_flip_work_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case 
RCU_GP_STATE_READER_WINDOW_FLIP: + rcu_cpu_data_flip_reader_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case RCU_GP_STATE_WORK_FLUSH: + rcu_cpu_data_flush(cpu_data); + rcu_data_ack_cpu(data); + break; + default: + panic("rcu: invalid grace period state"); + } + + cpu_data->gp_state = global_gp_state; + } +} + +void +rcu_report_context_switch(struct rcu_reader *reader) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + /* + * Most readers don't need to be accounted for because their execution + * doesn't overlap with a grace period. If a reader is preempted however, + * it must be accounted in case a grace period starts while the reader + * is preempted. Accounting also occurs when a grace period starts, and + * more exactly, when the reader window ID of a processor is flipped. + */ + rcu_reader_account(reader, rcu_get_cpu_data()); +} + +void +rcu_report_periodic_event(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_cpu_data_check_gp_state(rcu_get_cpu_data()); +} + +void +rcu_defer(struct work *work) +{ + struct rcu_cpu_data *cpu_data; + unsigned long flags; + + thread_preempt_disable_intr_save(&flags); + cpu_data = rcu_get_cpu_data(); + rcu_cpu_data_queue(cpu_data, work); + thread_preempt_enable_intr_restore(flags); +} + +static void +rcu_waiter_wakeup(struct work *work) +{ + struct rcu_waiter *waiter; + + waiter = structof(work, struct rcu_waiter, work); + + spinlock_lock(&waiter->lock); + waiter->done = true; + thread_wakeup(waiter->thread); + spinlock_unlock(&waiter->lock); +} + +static void +rcu_waiter_init(struct rcu_waiter *waiter, struct thread *thread) +{ + work_init(&waiter->work, rcu_waiter_wakeup); + spinlock_init(&waiter->lock); + waiter->thread = thread; + waiter->done = false; +} + +static void +rcu_waiter_wait(struct rcu_waiter *waiter) +{ + rcu_defer(&waiter->work); + + spinlock_lock(&waiter->lock); + + while (!waiter->done) { + thread_sleep(&waiter->lock, waiter, "rcu_wait"); + } 
+ + spinlock_unlock(&waiter->lock); +} + +void +rcu_wait(void) +{ + struct rcu_waiter waiter; + + rcu_waiter_init(&waiter, thread_self()), + rcu_waiter_wait(&waiter); +} + +static int __init +rcu_setup(void) +{ + rcu_data_init(&rcu_data); + + for (size_t i = 0; i < cpu_count(); i++) { + rcu_cpu_data_init(percpu_ptr(rcu_cpu_data, i), i); + } + + return 0; +} + +INIT_OP_DEFINE(rcu_setup, + INIT_OP_DEP(clock_setup, true), + INIT_OP_DEP(cpu_mp_probe, true), + INIT_OP_DEP(spinlock_setup, true), + INIT_OP_DEP(sref_setup, true), + INIT_OP_DEP(syscnt_setup, true), + INIT_OP_DEP(thread_bootstrap, true), + INIT_OP_DEP(timer_setup, true), + INIT_OP_DEP(work_setup, true)); diff --git a/kern/rcu.h b/kern/rcu.h new file mode 100644 index 0000000..1ddae7d --- /dev/null +++ b/kern/rcu.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Read-Copy Update. + * + * This module provides a synchronization framework for read-only lockless + * operations, best used on read-mostly data structures. + * + * To learn more about RCU, see "Is Parallel Programming Hard, And, If So, + * What Can You Do About It ?" by Paul E. McKenney. + * + * A thorough discussion of RCU's requirements can be found at [1]. 
+ * However, the memory model and terminology used in that document being + * different from C11, the memory barrier guarantees are hereby translated + * into memory ordering guarantees : + * + * 1. Release-acquire ordering is enforced between the end of all + * read-side critical sections starting before a grace period starts + * and the start of all works deferred before the same grace period + * starts. + * 2. Release-acquire ordering is enforced between the start of all + * read-side critical sections ending after a grace period ends + * and all the work deferrals done before the same grace period + * starts. + * 3. Release-acquire ordering is enforced between all the work deferrals + * done before a grace period starts and the start of all works deferred + * before the same grace period starts. + * + * [1] https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.html. + */ + +#ifndef _KERN_RCU_H +#define _KERN_RCU_H + +#include <kern/atomic.h> +#include <kern/init.h> +#include <kern/rcu_i.h> +#include <kern/rcu_types.h> +#include <kern/thread.h> +#include <kern/work.h> + +/* + * Thread-local RCU data. + */ +struct rcu_reader; + +/* + * Safely store a pointer. + * + * This macro enforces release ordering on the given pointer. + */ +#define rcu_store_ptr(ptr, value) atomic_store(&(ptr), value, ATOMIC_RELEASE) + +/* + * Safely load a pointer. + * + * This macro enforces consume ordering on the given pointer. + */ +#define rcu_load_ptr(ptr) atomic_load(&(ptr), ATOMIC_CONSUME) + +/* + * Read-side critical section functions. + * + * Critical sections are preemptible, may safely nest, and may safely + * be used in interrupt context. It is not allowed to sleep inside a + * read-side critical section. However, it is allowed to acquire locks + * that don't sleep, such as spin locks. + * + * Entering or leaving a critical section doesn't provide any memory + * ordering guarantee. 
+ */ + +static inline void +rcu_read_enter(void) +{ + rcu_reader_inc(thread_rcu_reader(thread_self())); + barrier(); +} + +static inline void +rcu_read_leave(void) +{ + barrier(); + rcu_reader_dec(thread_rcu_reader(thread_self())); +} + +/* + * Initialize a RCU reader. + */ +void rcu_reader_init(struct rcu_reader *reader); + +/* + * Report a context switch on the current processor. + * + * The argument is the RCU reader of the preempted thread. + * + * Interrupts and preemption must be disabled when calling this function. + */ +void rcu_report_context_switch(struct rcu_reader *reader); + +/* + * Report a periodic event on the current processor. + * + * Interrupts and preemption must be disabled when calling this function. + */ +void rcu_report_periodic_event(void); + +/* + * Defer a work until all existing read-side references are dropped, + * without blocking. + * + * The given work is scheduled when all existing read-side references + * are dropped. + */ +void rcu_defer(struct work *work); + +/* + * Wait for all existing read-side references to be dropped. + * + * This function sleeps, and may do so for a moderately long duration, + * at leat a few system timer ticks, sometimes a lot more. + */ +void rcu_wait(void); + +/* + * This init operation provides : + * - read-side critical sections may be used + * - module fully initialized + */ +INIT_OP_DECLARE(rcu_setup); + +#endif /* _KERN_RCU_H */ diff --git a/kern/rcu_i.h b/kern/rcu_i.h new file mode 100644 index 0000000..6be933d --- /dev/null +++ b/kern/rcu_i.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _KERN_RCU_I_H +#define _KERN_RCU_I_H + +#include <assert.h> +#include <stdbool.h> + +#include <kern/macros.h> +#include <kern/rcu_types.h> +#include <kern/thread.h> + +void rcu_reader_leave(struct rcu_reader *reader); + +static inline bool +rcu_reader_in_cs(const struct rcu_reader *reader) +{ + return reader->level != 0; +} + +static inline bool +rcu_reader_linked(const struct rcu_reader *reader) +{ + assert(reader == thread_rcu_reader(thread_self())); + return reader->linked; +} + +static inline void +rcu_reader_inc(struct rcu_reader *reader) +{ + reader->level++; + assert(reader->level != 0); +} + +static inline void +rcu_reader_dec(struct rcu_reader *reader) +{ + assert(reader->level != 0); + reader->level--; + + if (unlikely(!rcu_reader_in_cs(reader) && rcu_reader_linked(reader))) { + rcu_reader_leave(reader); + } +} + +#endif /* _KERN_RCU_I_H */ diff --git a/kern/rcu_types.h b/kern/rcu_types.h new file mode 100644 index 0000000..12b8913 --- /dev/null +++ b/kern/rcu_types.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Isolated type definition used to avoid inclusion circular dependencies. + */ + +#ifndef _KERN_RCU_TYPES_H +#define _KERN_RCU_TYPES_H + +#include <stdbool.h> + +/* + * Thread-local data used to track threads running read-side critical + * sections. + * + * The window ID is valid if and only if the reader is linked. + * + * Interrupts and preemption must be disabled when accessing a reader. + */ +struct rcu_reader { + unsigned int level; + unsigned int wid; + bool linked; +}; + +#endif /* _KERN_RCU_TYPES_H */ diff --git a/kern/thread.c b/kern/thread.c index 20378ff..8460f3b 100644 --- a/kern/thread.c +++ b/kern/thread.c @@ -101,6 +101,7 @@ #include <kern/macros.h> #include <kern/panic.h> #include <kern/percpu.h> +#include <kern/rcu.h> #include <kern/shell.h> #include <kern/sleepq.h> #include <kern/spinlock.h> @@ -641,6 +642,8 @@ thread_runq_schedule(struct thread_runq *runq) assert(next->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL); if (likely(prev != next)) { + rcu_report_context_switch(thread_rcu_reader(prev)); + /* * That's where the true context switch occurs. The next thread must * unlock the run queue and reenable preemption. 
Note that unlocking @@ -1700,6 +1703,7 @@ thread_init_booter(unsigned int cpu) booter->flags = 0; booter->intr_level = 0; booter->preempt_level = 1; + rcu_reader_init(&booter->rcu_reader); cpumap_fill(&booter->cpumap); thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE); thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE); @@ -1828,6 +1832,7 @@ thread_init(struct thread *thread, void *stack, thread->pin_level = 0; thread->intr_level = 0; thread->llsync_level = 0; + rcu_reader_init(&thread->rcu_reader); cpumap_copy(&thread->cpumap, cpumap); thread_set_user_sched_policy(thread, attr->policy); thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy)); @@ -2108,7 +2113,6 @@ static void thread_idle(void *arg) { struct thread *self; - int error; (void)arg; @@ -2116,13 +2120,6 @@ thread_idle(void *arg) for (;;) { thread_preempt_disable(); - error = sref_unregister(); - - if (error) { - assert(error == ERROR_BUSY); - goto error_sref; - } - llsync_unregister(); for (;;) { @@ -2137,9 +2134,6 @@ thread_idle(void *arg) } llsync_register(); - sref_register(); - -error_sref: thread_preempt_enable(); } } diff --git a/kern/thread.h b/kern/thread.h index 58322bd..c4dbd5d 100644 --- a/kern/thread.h +++ b/kern/thread.h @@ -679,6 +679,16 @@ thread_llsync_read_dec(void) } /* + * RCU functions. + */ + +static inline struct rcu_reader * +thread_rcu_reader(struct thread *thread) +{ + return &thread->rcu_reader; +} + +/* * Type for thread-specific data destructor. 
*/ typedef void (*thread_dtor_fn_t)(void *); diff --git a/kern/thread_i.h b/kern/thread_i.h index 96ce100..979d4b0 100644 --- a/kern/thread_i.h +++ b/kern/thread_i.h @@ -24,6 +24,7 @@ #include <kern/atomic.h> #include <kern/cpumap.h> #include <kern/list_types.h> +#include <kern/rcu_types.h> #include <kern/spinlock_types.h> #include <kern/turnstile_types.h> #include <machine/cpu.h> @@ -137,6 +138,8 @@ struct thread { /* Read-side critical section level, not in any if 0 */ unsigned short llsync_level; /* (-) */ + struct rcu_reader rcu_reader; /* (-) */ + /* Processors on which this thread is allowed to run */ struct cpumap cpumap; /* (r) */ diff --git a/test/Kconfig b/test/Kconfig index 8f3ae8b..6a6221b 100644 --- a/test/Kconfig +++ b/test/Kconfig @@ -22,6 +22,9 @@ config TEST_MODULE_MUTEX_PI config TEST_MODULE_PMAP_UPDATE_MP bool "pmap_update_mp" +config TEST_MODULE_RCU_DEFER + bool "rcu_defer" + config TEST_MODULE_SREF_DIRTY_ZEROES bool "sref_dirty_zeroes" diff --git a/test/Makefile b/test/Makefile index 098cecb..b7ac3d7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -2,6 +2,7 @@ x15_SOURCES-$(CONFIG_TEST_MODULE_LLSYNC_DEFER) += test/test_llsync_defe x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX) += test/test_mutex.c x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX_PI) += test/test_mutex_pi.c x15_SOURCES-$(CONFIG_TEST_MODULE_PMAP_UPDATE_MP) += test/test_pmap_update_mp.c +x15_SOURCES-$(CONFIG_TEST_MODULE_RCU_DEFER) += test/test_rcu_defer.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_DIRTY_ZEROES) += test/test_sref_dirty_zeroes.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_NOREF) += test/test_sref_noref.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_WEAKREF) += test/test_sref_weakref.c diff --git a/test/test_rcu_defer.c b/test/test_rcu_defer.c new file mode 100644 index 0000000..e1ab0e5 --- /dev/null +++ b/test/test_rcu_defer.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2014-2018 Richard Braun. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This test module is a stress test, expected to never terminate, of the + * work deferring functionality of the rcu module. It creates three + * threads, a producer, a consumer, and a reader. The producer allocates + * a page and writes it. It then transfers the page to the consumer, using + * the rcu interface to update the global page pointer. Once at the + * consumer, the rcu interface is used to defer the release of the page. + * Concurrently, the reader accesses the page and checks its content when + * available. These accesses are performed inside a read-side critical + * section and should therefore never fail. + * + * Each thread regularly prints a string to report that it's making progress. 
+ */ + +#include <stddef.h> +#include <stdio.h> +#include <string.h> + +#include <kern/condition.h> +#include <kern/cpumap.h> +#include <kern/error.h> +#include <kern/kmem.h> +#include <kern/macros.h> +#include <kern/mutex.h> +#include <kern/panic.h> +#include <kern/rcu.h> +#include <kern/thread.h> +#include <kern/work.h> +#include <machine/page.h> +#include <test/test.h> +#include <vm/vm_kmem.h> + +#define TEST_LOOPS_PER_PRINT 100000 + +struct test_pdsc { + struct work work; + void *addr; +}; + +#define TEST_VALIDATION_BYTE 0xab + +static struct mutex test_lock; +static struct condition test_condition; +static struct test_pdsc *test_pdsc; + +static struct kmem_cache test_pdsc_cache; + +static void +test_alloc(void *arg) +{ + struct test_pdsc *pdsc; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + mutex_lock(&test_lock); + + for (;;) { + while (test_pdsc != NULL) { + condition_wait(&test_condition, &test_lock); + } + + pdsc = kmem_cache_alloc(&test_pdsc_cache); + + if (pdsc != NULL) { + pdsc->addr = vm_kmem_alloc(PAGE_SIZE); + + if (pdsc->addr != NULL) { + memset(pdsc->addr, TEST_VALIDATION_BYTE, PAGE_SIZE); + } + } + + rcu_store_ptr(test_pdsc, pdsc); + condition_signal(&test_condition); + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("alloc "); + } + + nr_loops++; + } +} + +static void +test_deferred_free(struct work *work) +{ + struct test_pdsc *pdsc; + + pdsc = structof(work, struct test_pdsc, work); + + if (pdsc->addr != NULL) { + vm_kmem_free(pdsc->addr, PAGE_SIZE); + } + + kmem_cache_free(&test_pdsc_cache, pdsc); +} + +static void +test_free(void *arg) +{ + struct test_pdsc *pdsc; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + mutex_lock(&test_lock); + + for (;;) { + while (test_pdsc == NULL) { + condition_wait(&test_condition, &test_lock); + } + + pdsc = test_pdsc; + rcu_store_ptr(test_pdsc, NULL); + + if (pdsc != NULL) { + work_init(&pdsc->work, test_deferred_free); + rcu_defer(&pdsc->work); + } + + 
condition_signal(&test_condition); + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("free "); + } + + nr_loops++; + } +} + +static void +test_read(void *arg) +{ + const struct test_pdsc *pdsc; + const unsigned char *s; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + for (;;) { + rcu_read_enter(); + + pdsc = rcu_load_ptr(test_pdsc); + + if (pdsc != NULL) { + s = (const unsigned char *)pdsc->addr; + + if (s != NULL) { + for (unsigned int i = 0; i < PAGE_SIZE; i++) { + if (s[i] != TEST_VALIDATION_BYTE) { + panic("invalid content"); + } + } + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("read "); + } + + nr_loops++; + } + } + + rcu_read_leave(); + } +} + +void +test_setup(void) +{ + struct thread_attr attr; + struct thread *thread; + struct cpumap *cpumap; + int error; + + condition_init(&test_condition); + mutex_init(&test_lock); + + error = cpumap_create(&cpumap); + error_check(error, "cpumap_create"); + + kmem_cache_init(&test_pdsc_cache, "test_pdsc", + sizeof(struct test_pdsc), 0, NULL, 0); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_alloc"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, cpu_count() - 1); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_alloc, NULL); + error_check(error, "thread_create"); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_free"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, cpu_count() - 1); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_free, NULL); + error_check(error, "thread_create"); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_read"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, 0); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_read, NULL); + error_check(error, "thread_create"); + + cpumap_destroy(cpumap); +} |