author     Richard Braun <rbraun@sceen.net>  2018-02-20 20:46:37 +0100
committer  Richard Braun <rbraun@sceen.net>  2018-02-20 20:46:37 +0100
commit     da8eb9c244d27fd042adde6234ccec079681d7f4 (patch)
tree       f0de4b67e3a59dded6598efe3db87c835da72b92
parent     ed9ada07d9c841b3f3f3a3f841aae2792a30779a (diff)
parent     22dff6b7a6839e77d713d671cb2038e56b64ac16 (diff)
Merge branch 'preemptible_rcu'
-rw-r--r--  kern/Makefile          |    1
-rw-r--r--  kern/clock.c           |    2
-rw-r--r--  kern/rcu.c             |  772
-rw-r--r--  kern/rcu.h             |  146
-rw-r--r--  kern/rcu_i.h           |   61
-rw-r--r--  kern/rcu_types.h       |   40
-rw-r--r--  kern/thread.c          |   16
-rw-r--r--  kern/thread.h          |   10
-rw-r--r--  kern/thread_i.h        |    3
-rw-r--r--  test/Kconfig           |    3
-rw-r--r--  test/Makefile          |    1
-rw-r--r--  test/test_rcu_defer.c  |  231
12 files changed, 1275 insertions, 11 deletions
diff --git a/kern/Makefile b/kern/Makefile
index 0aa96fc..6c0778b 100644
--- a/kern/Makefile
+++ b/kern/Makefile
@@ -20,6 +20,7 @@ x15_SOURCES-y += \
kern/plist.c \
kern/printf.c \
kern/rbtree.c \
+ kern/rcu.c \
kern/rdxtree.c \
kern/rtmutex.c \
kern/semaphore.c \
diff --git a/kern/clock.c b/kern/clock.c
index 19f823c..c09ffdd 100644
--- a/kern/clock.c
+++ b/kern/clock.c
@@ -24,6 +24,7 @@
#include <kern/init.h>
#include <kern/llsync.h>
#include <kern/percpu.h>
+#include <kern/rcu.h>
#include <kern/sref.h>
#include <kern/syscnt.h>
#include <kern/thread.h>
@@ -91,6 +92,7 @@ void clock_tick_intr(void)
timer_report_periodic_event();
llsync_report_periodic_event();
+ rcu_report_periodic_event();
sref_report_periodic_event();
work_report_periodic_event();
thread_report_periodic_event();
diff --git a/kern/rcu.c b/kern/rcu.c
new file mode 100644
index 0000000..82c62df
--- /dev/null
+++ b/kern/rcu.c
@@ -0,0 +1,772 @@
+/*
+ * Copyright (c) 2018 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * This implementation is based on the paper "Extending RCU for Realtime
+ * and Embedded Workloads" by Paul E. McKenney, Ingo Molnar, Dipankar Sarma,
+ * and Suparna Bhattacharya. Besides the mechanisms not yet implemented,
+ * such as priority boosting, the differences are described below.
+ *
+ * First, this implementation uses scalable reference counters provided
+ * by the sref module instead of per-CPU counters as described in the paper.
+ * The main benefit of this approach is the centralization of most scalability
+ * improvements in the sref module, which should propagate to all sref users,
+ * including RCU.
+ *
+ * In addition, this implementation introduces the concept of windows, where
+ * a window is a range in time to which readers may be linked. Here, a
+ * grace period is defined as the time range at the end of a window where
+ * various synchronization steps are performed to enforce the RCU guarantees.
+ * The minimum duration of a window acts as a knob allowing users to tune
+ * the behavior of the RCU system.
+ *
+ * Finally, the state machine described in the paper is updated to
+ * accommodate windows, since grace periods don't run back-to-back.
+ * Windows are regularly checked and flipped if the previous one isn't
+ * active any more. From that moment, processors may notice the global flip
+ * and perform a local flip of their work window ID. Once all processors
+ * have acknowledged the flip, it is certain that no new work may be queued
+ * on the previous window. At this point, the same occurs for the
+ * processor-local reader window ID, and once all processors have
+ * acknowledged that flip, there can be no new reader linked to the previous
+ * window. The RCU system then releases its own reference to the previous
+ * window and waits for the window reference counter to drop to 0, indicating
+ * that all readers linked to the previous window have left their read-side
+ * critical section. When this global event occurs, processors are requested
+ * to flush the works queued for the previous window, and once they all have
+ * acknowledged their flush, the window ends and becomes inactive, allowing
+ * a new grace period to occur later on.
+ *
+ * Here is an informal diagram describing this process :
+ *
+ * t ---->
+ *
+ * reader window flip ------+       +--- no more readers
+ * work window flip ------+ |       | +--- works flushed
+ * (grace period start)   | |       | |   (grace period / window end)
+ *                        v v       v v
+ *    +-------------------+-+-------+-+
+ *    |                   . .       . |
+ *    |      window 0     . .  gp   . |
+ *    |      removal      . .       . |  reclamation
+ *    +-------------------+-+-------+-+-+------+---+
+ *                        |             .          |
+ *                        |  window 1   .    gp    |
+ *                        |  removal    .          |  reclamation
+ *                        +-------------+------+---+-----
+ *                                      |
+ *                                      |  window 2  ...
+ *                                      |
+ *                                      +-------------
+ *
+ * TODO Improve atomic acknowledgment scalability.
+ * TODO Handle large amounts of deferred works.
+ * TODO Priority boosting of slow readers.
+ * TODO CPU registration for dyntick-friendly behavior.
+ */
+
+#include <assert.h>
+#include <stdalign.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <kern/atomic.h>
+#include <kern/clock.h>
+#include <kern/init.h>
+#include <kern/macros.h>
+#include <kern/rcu.h>
+#include <kern/panic.h>
+#include <kern/percpu.h>
+#include <kern/spinlock.h>
+#include <kern/sref.h>
+#include <kern/syscnt.h>
+#include <kern/thread.h>
+#include <kern/timer.h>
+#include <kern/work.h>
+#include <machine/cpu.h>
+
+/*
+ * Negative value close to 0 so that, interpreted as unsigned, window IDs
+ * start near the wrap-around point and an overflow occurs early.
+ */
+#define RCU_WINDOW_ID_INIT_VALUE ((unsigned int)-500)
+
+/*
+ * Interval between window checking.
+ *
+ * When windows are checked, a flip occurs if the previous window isn't
+ * active any more.
+ */
+#define RCU_WINDOW_CHECK_INTERVAL_MS 10
+
+/*
+ * Grace period states.
+ *
+ * These states are only used to trigger per-CPU processing that is
+ * globally acknowledged by decrementing a global atomic counter. They
+ * do not completely represent the actual state of a grace period.
+ */
+enum rcu_gp_state {
+ RCU_GP_STATE_WORK_WINDOW_FLIP,
+ RCU_GP_STATE_READER_WINDOW_FLIP,
+ RCU_GP_STATE_WORK_FLUSH,
+};
+
+/*
+ * Per-CPU view of a window.
+ *
+ * Deferred works are scheduled when the window ends.
+ */
+struct rcu_cpu_window {
+ struct work_queue works;
+};
+
+/*
+ * Per-CPU RCU data.
+ *
+ * Each processor maintains two local window IDs. One is used as the current
+ * window ID when deferring work, the other when detecting a reader. A local
+ * flip occurs when a processor notices that the global grace period state
+ * no longer matches the local grace period state. These checks only occur
+ * on periodic events.
+ *
+ * Interrupts and preemption must be disabled when accessing local CPU data.
+ */
+struct rcu_cpu_data {
+ enum rcu_gp_state gp_state;
+ unsigned int work_wid;
+ unsigned int reader_wid;
+ struct rcu_cpu_window windows[2];
+ struct syscnt sc_nr_detected_readers;
+};
+
+/*
+ * Global window.
+ *
+ * A window is a time range that tracks read-side references. Conceptually,
+ * each reader adds a reference to the current window. In practice, references
+ * are only added when readers are detected, which occurs on a context switch
+ * (to track preempted threads) or a reader window flip (to prevent currently
+ * running readers from being linked to the next window).
+ *
+ * When a window is started, its scalable reference counter is initialized
+ * with a reference owned by the RCU system. That reference guarantees that
+ * the window remains active as long as new readers may add references,
+ * since it prevents the counter from dropping to 0. After a reader window
+ * flip, no new references to the window may be added, and the initial
+ * reference is dropped, allowing the counter to reach 0 once all detected
+ * readers leave their critical section and unreference the window they're
+ * linked to.
+ */
+struct rcu_window {
+ struct sref_counter nr_refs;
+ uint64_t start_ts;
+ bool active;
+};
+
+/*
+ * Global data.
+ *
+ * Processors regularly check the grace period state against their own,
+ * locally cached grace period state, and take action whenever they differ.
+ * False sharing is avoided by making the global grace period state fill an
+ * entire cache line on SMP.
+ *
+ * After processors notice a grace period state change, they acknowledge
+ * noticing this change by decrementing the atomic acknowledgment counter,
+ * which also fills a complete cache line on SMP in order to restrict cache
+ * line bouncing. Atomic operations on this counter are done with
+ * acquire-release ordering to enforce the memory ordering guarantees
+ * required by the implementation, as well as those provided by the public
+ * interface.
+ *
+ * In addition to the global window ID and the windows themselves, the data
+ * include a timer, used to trigger the end of windows, i.e. grace periods.
+ * Since the timer function, atomic acknowledgments, and window no-reference
+ * function chain each other, there is currently no need for a global lock.
+ */
+struct rcu_data {
+ struct {
+ alignas(CPU_L1_SIZE) enum rcu_gp_state gp_state;
+ };
+ struct {
+ alignas(CPU_L1_SIZE) unsigned int nr_acks;
+ };
+
+ unsigned int wid;
+ struct rcu_window windows[2];
+ struct timer timer;
+ struct syscnt sc_nr_windows;
+ struct syscnt sc_last_window_ms;
+ struct syscnt sc_longest_window_ms;
+};
+
+/*
+ * Structure used to implement rcu_wait().
+ */
+struct rcu_waiter {
+ struct work work;
+ struct spinlock lock;
+ struct thread *thread;
+ bool done;
+};
+
+static struct rcu_data rcu_data;
+static struct rcu_cpu_data rcu_cpu_data __percpu;
+
+static struct rcu_cpu_data *
+rcu_get_cpu_data(void)
+{
+ assert(!cpu_intr_enabled());
+ assert(!thread_preempt_enabled());
+
+ return cpu_local_ptr(rcu_cpu_data);
+}
+
+static enum rcu_gp_state
+rcu_data_get_gp_state(const struct rcu_data *data)
+{
+ return data->gp_state;
+}
+
+static unsigned int
+rcu_data_get_wid(const struct rcu_data *data)
+{
+ return data->wid;
+}
+
+static struct rcu_window *
+rcu_data_get_window_from_index(struct rcu_data *data, size_t index)
+{
+ assert(index < ARRAY_SIZE(data->windows));
+ return &data->windows[index];
+}
+
+static struct rcu_window *
+rcu_data_get_window(struct rcu_data *data, unsigned int wid)
+{
+ return rcu_data_get_window_from_index(data, wid & 1);
+}
+
+static void
+rcu_data_update_gp_state(struct rcu_data *data, enum rcu_gp_state gp_state)
+{
+ assert(data->nr_acks == 0);
+
+ switch (gp_state) {
+ case RCU_GP_STATE_WORK_WINDOW_FLIP:
+ assert(data->gp_state == RCU_GP_STATE_WORK_FLUSH);
+ break;
+ case RCU_GP_STATE_READER_WINDOW_FLIP:
+ assert(data->gp_state == RCU_GP_STATE_WORK_WINDOW_FLIP);
+ break;
+ case RCU_GP_STATE_WORK_FLUSH:
+ assert(data->gp_state == RCU_GP_STATE_READER_WINDOW_FLIP);
+ break;
+ default:
+ panic("rcu: invalid grace period state");
+ }
+
+ data->nr_acks = cpu_count();
+ atomic_store(&data->gp_state, gp_state, ATOMIC_RELEASE);
+}
+
+static bool
+rcu_data_check_gp_state(const struct rcu_data *data,
+ enum rcu_gp_state local_gp_state,
+ enum rcu_gp_state *global_gp_state)
+{
+ *global_gp_state = atomic_load(&data->gp_state, ATOMIC_RELAXED);
+
+ if (unlikely(local_gp_state != *global_gp_state)) {
+ atomic_fence_acquire();
+ return true;
+ }
+
+ return false;
+}
+
+static void
+rcu_window_end(struct rcu_window *window)
+{
+ assert(window->active);
+ window->active = false;
+}
+
+static void
+rcu_window_ref(struct rcu_window *window)
+{
+ sref_counter_inc(&window->nr_refs);
+}
+
+static void
+rcu_window_unref(struct rcu_window *window)
+{
+ sref_counter_dec(&window->nr_refs);
+}
+
+static uint64_t
+rcu_window_get_start_ts(const struct rcu_window *window)
+{
+ return window->start_ts;
+}
+
+static void
+rcu_window_flush(struct sref_counter *counter)
+{
+ (void)counter;
+
+ rcu_data_update_gp_state(&rcu_data, RCU_GP_STATE_WORK_FLUSH);
+}
+
+static void __init
+rcu_window_init(struct rcu_window *window)
+{
+ window->active = false;
+}
+
+static void
+rcu_window_start(struct rcu_window *window)
+{
+ assert(!window->active);
+
+ sref_counter_init(&window->nr_refs, 1, NULL, rcu_window_flush);
+ window->start_ts = clock_get_time();
+ window->active = true;
+}
+
+static bool
+rcu_window_active(const struct rcu_window *window)
+{
+ return window->active;
+}
+
+static void
+rcu_data_end_prev_window(struct rcu_data *data, uint64_t now)
+{
+ struct rcu_window *window;
+ uint64_t duration;
+
+ window = rcu_data_get_window(data, data->wid - 1);
+ duration = clock_ticks_to_ms(now - rcu_window_get_start_ts(window));
+ syscnt_set(&data->sc_last_window_ms, duration);
+
+ if (duration > syscnt_read(&data->sc_longest_window_ms)) {
+ syscnt_set(&data->sc_longest_window_ms, duration);
+ }
+
+ rcu_window_end(window);
+}
+
+static void
+rcu_data_schedule_timer(struct rcu_data *data, uint64_t now)
+{
+ uint64_t ticks;
+
+ ticks = clock_ticks_from_ms(RCU_WINDOW_CHECK_INTERVAL_MS);
+ timer_schedule(&data->timer, now + ticks);
+}
+
+static void
+rcu_data_ack_cpu(struct rcu_data *data)
+{
+ struct rcu_window *window;
+ unsigned int prev_nr_acks;
+ uint64_t now;
+
+ prev_nr_acks = atomic_fetch_sub(&data->nr_acks, 1, ATOMIC_ACQ_REL);
+
+ if (prev_nr_acks != 1) {
+ assert(prev_nr_acks != 0);
+ return;
+ }
+
+ switch (data->gp_state) {
+ case RCU_GP_STATE_WORK_WINDOW_FLIP:
+ rcu_data_update_gp_state(data, RCU_GP_STATE_READER_WINDOW_FLIP);
+ break;
+ case RCU_GP_STATE_READER_WINDOW_FLIP:
+ window = rcu_data_get_window(data, data->wid - 1);
+ rcu_window_unref(window);
+ break;
+ case RCU_GP_STATE_WORK_FLUSH:
+ now = clock_get_time();
+ rcu_data_end_prev_window(data, now);
+ rcu_data_schedule_timer(data, now);
+ break;
+ default:
+ panic("rcu: invalid grace period state");
+ }
+}
+
+static bool
+rcu_data_flip_windows(struct rcu_data *data)
+{
+ struct rcu_window *window;
+
+ window = rcu_data_get_window(data, data->wid - 1);
+
+ if (rcu_window_active(window)) {
+ return false;
+ }
+
+ rcu_window_start(window);
+ syscnt_inc(&data->sc_nr_windows);
+ data->wid++;
+ rcu_data_update_gp_state(data, RCU_GP_STATE_WORK_WINDOW_FLIP);
+ return true;
+}
+
+static void
+rcu_data_check_windows(struct timer *timer)
+{
+ struct rcu_data *data;
+ bool flipped;
+
+ data = &rcu_data;
+ flipped = rcu_data_flip_windows(data);
+
+ if (!flipped) {
+ rcu_data_schedule_timer(data, timer_get_time(timer));
+ }
+}
+
+static void __init
+rcu_data_init(struct rcu_data *data)
+{
+ data->gp_state = RCU_GP_STATE_WORK_FLUSH;
+ data->nr_acks = 0;
+ data->wid = RCU_WINDOW_ID_INIT_VALUE;
+
+ for (size_t i = 0; i < ARRAY_SIZE(data->windows); i++) {
+ rcu_window_init(rcu_data_get_window_from_index(data, i));
+ }
+
+ rcu_window_start(rcu_data_get_window(data, data->wid));
+
+ timer_init(&data->timer, rcu_data_check_windows, 0);
+ rcu_data_schedule_timer(data, clock_get_time());
+
+ syscnt_register(&data->sc_nr_windows, "rcu_nr_windows");
+ syscnt_register(&data->sc_last_window_ms, "rcu_last_window_ms");
+ syscnt_register(&data->sc_longest_window_ms, "rcu_longest_window_ms");
+}
+
+static void __init
+rcu_cpu_window_init(struct rcu_cpu_window *cpu_window)
+{
+ work_queue_init(&cpu_window->works);
+}
+
+static void
+rcu_cpu_window_queue(struct rcu_cpu_window *cpu_window, struct work *work)
+{
+ work_queue_push(&cpu_window->works, work);
+}
+
+static void
+rcu_cpu_window_flush(struct rcu_cpu_window *cpu_window)
+{
+ work_queue_schedule(&cpu_window->works, 0);
+ work_queue_init(&cpu_window->works);
+}
+
+static unsigned int
+rcu_cpu_data_get_reader_wid(const struct rcu_cpu_data *cpu_data)
+{
+ return cpu_data->reader_wid;
+}
+
+static struct rcu_cpu_window *
+rcu_cpu_data_get_window_from_index(struct rcu_cpu_data *cpu_data, size_t index)
+{
+ assert(index < ARRAY_SIZE(cpu_data->windows));
+ return &cpu_data->windows[index];
+}
+
+static struct rcu_cpu_window *
+rcu_cpu_data_get_window(struct rcu_cpu_data *cpu_data, unsigned int wid)
+{
+ return rcu_cpu_data_get_window_from_index(cpu_data, wid & 1);
+}
+
+static void __init
+rcu_cpu_data_init(struct rcu_cpu_data *cpu_data, unsigned int cpu)
+{
+ struct rcu_data *data;
+ char name[SYSCNT_NAME_SIZE];
+
+ data = &rcu_data;
+
+ cpu_data->gp_state = rcu_data_get_gp_state(data);
+ cpu_data->work_wid = rcu_data_get_wid(data);
+ cpu_data->reader_wid = cpu_data->work_wid;
+
+ for (size_t i = 0; i < ARRAY_SIZE(cpu_data->windows); i++) {
+ rcu_cpu_window_init(rcu_cpu_data_get_window_from_index(cpu_data, i));
+ }
+
+ snprintf(name, sizeof(name), "rcu_nr_detected_readers/%u", cpu);
+ syscnt_register(&cpu_data->sc_nr_detected_readers, name);
+}
+
+static void
+rcu_cpu_data_queue(struct rcu_cpu_data *cpu_data, struct work *work)
+{
+ struct rcu_cpu_window *cpu_window;
+
+ cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid);
+ rcu_cpu_window_queue(cpu_window, work);
+}
+
+static void
+rcu_cpu_data_flush(struct rcu_cpu_data *cpu_data)
+{
+ struct rcu_cpu_window *cpu_window;
+
+ assert(cpu_data->work_wid == cpu_data->reader_wid);
+
+ cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid - 1);
+ rcu_cpu_window_flush(cpu_window);
+}
+
+void
+rcu_reader_init(struct rcu_reader *reader)
+{
+ reader->level = 0;
+ reader->linked = false;
+}
+
+static void
+rcu_reader_link(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data)
+{
+ assert(!cpu_intr_enabled());
+ assert(reader == thread_rcu_reader(thread_self()));
+ assert(!rcu_reader_linked(reader));
+
+ reader->wid = rcu_cpu_data_get_reader_wid(cpu_data);
+ reader->linked = true;
+}
+
+static void
+rcu_reader_unlink(struct rcu_reader *reader)
+{
+ assert(reader->level == 0);
+ reader->linked = false;
+}
+
+static void
+rcu_reader_enter(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data)
+{
+ struct rcu_window *window;
+ struct rcu_data *data;
+ unsigned int wid;
+
+ if (rcu_reader_linked(reader)) {
+ return;
+ }
+
+ data = &rcu_data;
+ wid = rcu_cpu_data_get_reader_wid(cpu_data);
+ window = rcu_data_get_window(data, wid);
+
+ rcu_reader_link(reader, cpu_data);
+ rcu_window_ref(window);
+
+ syscnt_inc(&cpu_data->sc_nr_detected_readers);
+}
+
+void
+rcu_reader_leave(struct rcu_reader *reader)
+{
+ struct rcu_window *window;
+ struct rcu_data *data;
+
+ data = &rcu_data;
+
+ window = rcu_data_get_window(data, reader->wid);
+ rcu_window_unref(window);
+ rcu_reader_unlink(reader);
+}
+
+static void
+rcu_reader_account(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data)
+{
+ if (rcu_reader_in_cs(reader)) {
+ rcu_reader_enter(reader, cpu_data);
+ }
+}
+
+static void
+rcu_cpu_data_flip_work_wid(struct rcu_cpu_data *cpu_data)
+{
+ assert(!cpu_intr_enabled());
+ assert(!thread_preempt_enabled());
+
+ cpu_data->work_wid++;
+}
+
+static void
+rcu_cpu_data_flip_reader_wid(struct rcu_cpu_data *cpu_data)
+{
+ assert(!cpu_intr_enabled());
+ assert(!thread_preempt_enabled());
+
+ rcu_reader_account(thread_rcu_reader(thread_self()), cpu_data);
+ cpu_data->reader_wid++;
+}
+
+static void
+rcu_cpu_data_check_gp_state(struct rcu_cpu_data *cpu_data)
+{
+ enum rcu_gp_state local_gp_state, global_gp_state;
+ struct rcu_data *data;
+ bool diff;
+
+ data = &rcu_data;
+
+ for (;;) {
+ local_gp_state = cpu_data->gp_state;
+ diff = rcu_data_check_gp_state(data, local_gp_state, &global_gp_state);
+
+ if (!diff) {
+ break;
+ }
+
+ switch (global_gp_state) {
+ case RCU_GP_STATE_WORK_WINDOW_FLIP:
+ rcu_cpu_data_flip_work_wid(cpu_data);
+ rcu_data_ack_cpu(data);
+ break;
+ case RCU_GP_STATE_READER_WINDOW_FLIP:
+ rcu_cpu_data_flip_reader_wid(cpu_data);
+ rcu_data_ack_cpu(data);
+ break;
+ case RCU_GP_STATE_WORK_FLUSH:
+ rcu_cpu_data_flush(cpu_data);
+ rcu_data_ack_cpu(data);
+ break;
+ default:
+ panic("rcu: invalid grace period state");
+ }
+
+ cpu_data->gp_state = global_gp_state;
+ }
+}
+
+void
+rcu_report_context_switch(struct rcu_reader *reader)
+{
+ assert(!cpu_intr_enabled());
+ assert(!thread_preempt_enabled());
+
+ /*
+ * Most readers don't need to be accounted for because their execution
+ * doesn't overlap with a grace period. If a reader is preempted, however,
+ * it must be accounted for in case a grace period starts while the reader
+ * is preempted. Accounting also occurs when a grace period starts, more
+ * precisely when the reader window ID of a processor is flipped.
+ */
+ rcu_reader_account(reader, rcu_get_cpu_data());
+}
+
+void
+rcu_report_periodic_event(void)
+{
+ assert(!cpu_intr_enabled());
+ assert(!thread_preempt_enabled());
+
+ rcu_cpu_data_check_gp_state(rcu_get_cpu_data());
+}
+
+void
+rcu_defer(struct work *work)
+{
+ struct rcu_cpu_data *cpu_data;
+ unsigned long flags;
+
+ thread_preempt_disable_intr_save(&flags);
+ cpu_data = rcu_get_cpu_data();
+ rcu_cpu_data_queue(cpu_data, work);
+ thread_preempt_enable_intr_restore(flags);
+}
+
+static void
+rcu_waiter_wakeup(struct work *work)
+{
+ struct rcu_waiter *waiter;
+
+ waiter = structof(work, struct rcu_waiter, work);
+
+ spinlock_lock(&waiter->lock);
+ waiter->done = true;
+ thread_wakeup(waiter->thread);
+ spinlock_unlock(&waiter->lock);
+}
+
+static void
+rcu_waiter_init(struct rcu_waiter *waiter, struct thread *thread)
+{
+ work_init(&waiter->work, rcu_waiter_wakeup);
+ spinlock_init(&waiter->lock);
+ waiter->thread = thread;
+ waiter->done = false;
+}
+
+static void
+rcu_waiter_wait(struct rcu_waiter *waiter)
+{
+ rcu_defer(&waiter->work);
+
+ spinlock_lock(&waiter->lock);
+
+ while (!waiter->done) {
+ thread_sleep(&waiter->lock, waiter, "rcu_wait");
+ }
+
+ spinlock_unlock(&waiter->lock);
+}
+
+void
+rcu_wait(void)
+{
+ struct rcu_waiter waiter;
+
+ rcu_waiter_init(&waiter, thread_self());
+ rcu_waiter_wait(&waiter);
+}
+
+static int __init
+rcu_setup(void)
+{
+ rcu_data_init(&rcu_data);
+
+ for (size_t i = 0; i < cpu_count(); i++) {
+ rcu_cpu_data_init(percpu_ptr(rcu_cpu_data, i), i);
+ }
+
+ return 0;
+}
+
+INIT_OP_DEFINE(rcu_setup,
+ INIT_OP_DEP(clock_setup, true),
+ INIT_OP_DEP(cpu_mp_probe, true),
+ INIT_OP_DEP(spinlock_setup, true),
+ INIT_OP_DEP(sref_setup, true),
+ INIT_OP_DEP(syscnt_setup, true),
+ INIT_OP_DEP(thread_bootstrap, true),
+ INIT_OP_DEP(timer_setup, true),
+ INIT_OP_DEP(work_setup, true));
diff --git a/kern/rcu.h b/kern/rcu.h
new file mode 100644
index 0000000..1ddae7d
--- /dev/null
+++ b/kern/rcu.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Read-Copy Update.
+ *
+ * This module provides a synchronization framework for read-only lockless
+ * operations, best used on read-mostly data structures.
+ *
+ * To learn more about RCU, see "Is Parallel Programming Hard, And, If So,
+ * What Can You Do About It?" by Paul E. McKenney.
+ *
+ * A thorough discussion of RCU's requirements can be found at [1].
+ * However, since the memory model and terminology used in that document
+ * differ from C11, the memory barrier guarantees are translated here
+ * into memory ordering guarantees :
+ *
+ * 1. Release-acquire ordering is enforced between the end of all
+ * read-side critical sections starting before a grace period starts
+ * and the start of all works deferred before the same grace period
+ * starts.
+ * 2. Release-acquire ordering is enforced between the start of all
+ * read-side critical sections ending after a grace period ends
+ * and all the work deferrals done before the same grace period
+ * starts.
+ * 3. Release-acquire ordering is enforced between all the work deferrals
+ * done before a grace period starts and the start of all works deferred
+ * before the same grace period starts.
+ *
+ * [1] https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.html.
+ */
+
+#ifndef _KERN_RCU_H
+#define _KERN_RCU_H
+
+#include <kern/atomic.h>
+#include <kern/init.h>
+#include <kern/rcu_i.h>
+#include <kern/rcu_types.h>
+#include <kern/thread.h>
+#include <kern/work.h>
+
+/*
+ * Thread-local RCU data.
+ */
+struct rcu_reader;
+
+/*
+ * Safely store a pointer.
+ *
+ * This macro enforces release ordering on the given pointer.
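+ *
+ * For illustration only, a writer publishing a new object (all names
+ * below are hypothetical) might look like :
+ *
+ *   obj = obj_create();
+ *   obj->value = 123;
+ *   rcu_store_ptr(global_obj, obj);
+ *
+ * The release ordering ensures that readers loading the new pointer
+ * also see the fully initialized object.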
+ */
+#define rcu_store_ptr(ptr, value) atomic_store(&(ptr), value, ATOMIC_RELEASE)
+
+/*
+ * Safely load a pointer.
+ *
+ * This macro enforces consume ordering on the given pointer.
+ */
+#define rcu_load_ptr(ptr) atomic_load(&(ptr), ATOMIC_CONSUME)
+
+/*
+ * Read-side critical section functions.
+ *
+ * Critical sections are preemptible, may safely nest, and may safely
+ * be used in interrupt context. It is not allowed to sleep inside a
+ * read-side critical section. However, it is allowed to acquire locks
+ * that don't sleep, such as spin locks.
+ *
+ * Entering or leaving a critical section doesn't provide any memory
+ * ordering guarantee.
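+ *
+ * As an informal sketch, assuming a global pointer global_obj published
+ * with rcu_store_ptr() (the names are hypothetical), a reader might look
+ * like :
+ *
+ *   rcu_read_enter();
+ *
+ *   obj = rcu_load_ptr(global_obj);
+ *
+ *   if (obj != NULL) {
+ *       obj_inspect(obj); /* must not sleep */
+ *   }
+ *
+ *   rcu_read_leave();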
+ */
+
+static inline void
+rcu_read_enter(void)
+{
+ rcu_reader_inc(thread_rcu_reader(thread_self()));
+ barrier();
+}
+
+static inline void
+rcu_read_leave(void)
+{
+ barrier();
+ rcu_reader_dec(thread_rcu_reader(thread_self()));
+}
+
+/*
+ * Initialize an RCU reader.
+ */
+void rcu_reader_init(struct rcu_reader *reader);
+
+/*
+ * Report a context switch on the current processor.
+ *
+ * The argument is the RCU reader of the preempted thread.
+ *
+ * Interrupts and preemption must be disabled when calling this function.
+ */
+void rcu_report_context_switch(struct rcu_reader *reader);
+
+/*
+ * Report a periodic event on the current processor.
+ *
+ * Interrupts and preemption must be disabled when calling this function.
+ */
+void rcu_report_periodic_event(void);
+
+/*
+ * Defer a work until all existing read-side references are dropped,
+ * without blocking.
+ *
+ * The given work is scheduled when all existing read-side references
+ * are dropped.
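+ *
+ * For illustration, assuming an object embedding a work structure (the
+ * names are hypothetical), a deferred release might look like :
+ *
+ *   rcu_store_ptr(global_obj, NULL);          /* unpublish the object */
+ *   work_init(&obj->work, obj_deferred_free); /* set the release function */
+ *   rcu_defer(&obj->work);                    /* run it after a grace period */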
+ */
+void rcu_defer(struct work *work);
+
+/*
+ * Wait for all existing read-side references to be dropped.
+ *
+ * This function sleeps, and may do so for a moderately long duration,
+ * at least a few system timer ticks, sometimes a lot more.
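+ *
+ * For illustration (the names are hypothetical), a blocking updater may
+ * unpublish an object, wait, and then release it directly :
+ *
+ *   rcu_store_ptr(global_obj, NULL);
+ *   rcu_wait();
+ *   obj_destroy(obj);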
+ */
+void rcu_wait(void);
+
+/*
+ * This init operation provides :
+ * - read-side critical sections may be used
+ * - module fully initialized
+ */
+INIT_OP_DECLARE(rcu_setup);
+
+#endif /* _KERN_RCU_H */
diff --git a/kern/rcu_i.h b/kern/rcu_i.h
new file mode 100644
index 0000000..6be933d
--- /dev/null
+++ b/kern/rcu_i.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _KERN_RCU_I_H
+#define _KERN_RCU_I_H
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include <kern/macros.h>
+#include <kern/rcu_types.h>
+#include <kern/thread.h>
+
+void rcu_reader_leave(struct rcu_reader *reader);
+
+static inline bool
+rcu_reader_in_cs(const struct rcu_reader *reader)
+{
+ return reader->level != 0;
+}
+
+static inline bool
+rcu_reader_linked(const struct rcu_reader *reader)
+{
+ assert(reader == thread_rcu_reader(thread_self()));
+ return reader->linked;
+}
+
+static inline void
+rcu_reader_inc(struct rcu_reader *reader)
+{
+ reader->level++;
+ assert(reader->level != 0);
+}
+
+static inline void
+rcu_reader_dec(struct rcu_reader *reader)
+{
+ assert(reader->level != 0);
+ reader->level--;
+
+ if (unlikely(!rcu_reader_in_cs(reader) && rcu_reader_linked(reader))) {
+ rcu_reader_leave(reader);
+ }
+}
+
+#endif /* _KERN_RCU_I_H */
diff --git a/kern/rcu_types.h b/kern/rcu_types.h
new file mode 100644
index 0000000..12b8913
--- /dev/null
+++ b/kern/rcu_types.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Isolated type definition used to avoid circular inclusion dependencies.
+ */
+
+#ifndef _KERN_RCU_TYPES_H
+#define _KERN_RCU_TYPES_H
+
+#include <stdbool.h>
+
+/*
+ * Thread-local data used to track threads running read-side critical
+ * sections.
+ *
+ * The window ID is valid if and only if the reader is linked.
+ *
+ * Interrupts and preemption must be disabled when accessing a reader.
+ */
+struct rcu_reader {
+ unsigned int level;
+ unsigned int wid;
+ bool linked;
+};
+
+#endif /* _KERN_RCU_TYPES_H */
diff --git a/kern/thread.c b/kern/thread.c
index 20378ff..8460f3b 100644
--- a/kern/thread.c
+++ b/kern/thread.c
@@ -101,6 +101,7 @@
#include <kern/macros.h>
#include <kern/panic.h>
#include <kern/percpu.h>
+#include <kern/rcu.h>
#include <kern/shell.h>
#include <kern/sleepq.h>
#include <kern/spinlock.h>
@@ -641,6 +642,8 @@ thread_runq_schedule(struct thread_runq *runq)
assert(next->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL);
if (likely(prev != next)) {
+ rcu_report_context_switch(thread_rcu_reader(prev));
+
/*
* That's where the true context switch occurs. The next thread must
* unlock the run queue and reenable preemption. Note that unlocking
@@ -1700,6 +1703,7 @@ thread_init_booter(unsigned int cpu)
booter->flags = 0;
booter->intr_level = 0;
booter->preempt_level = 1;
+ rcu_reader_init(&booter->rcu_reader);
cpumap_fill(&booter->cpumap);
thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE);
thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE);
@@ -1828,6 +1832,7 @@ thread_init(struct thread *thread, void *stack,
thread->pin_level = 0;
thread->intr_level = 0;
thread->llsync_level = 0;
+ rcu_reader_init(&thread->rcu_reader);
cpumap_copy(&thread->cpumap, cpumap);
thread_set_user_sched_policy(thread, attr->policy);
thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy));
@@ -2108,7 +2113,6 @@ static void
thread_idle(void *arg)
{
struct thread *self;
- int error;
(void)arg;
@@ -2116,13 +2120,6 @@ thread_idle(void *arg)
for (;;) {
thread_preempt_disable();
- error = sref_unregister();
-
- if (error) {
- assert(error == ERROR_BUSY);
- goto error_sref;
- }
-
llsync_unregister();
for (;;) {
@@ -2137,9 +2134,6 @@ thread_idle(void *arg)
}
llsync_register();
- sref_register();
-
-error_sref:
thread_preempt_enable();
}
}
diff --git a/kern/thread.h b/kern/thread.h
index 58322bd..c4dbd5d 100644
--- a/kern/thread.h
+++ b/kern/thread.h
@@ -679,6 +679,16 @@ thread_llsync_read_dec(void)
}
/*
+ * RCU functions.
+ */
+
+static inline struct rcu_reader *
+thread_rcu_reader(struct thread *thread)
+{
+ return &thread->rcu_reader;
+}
+
+/*
* Type for thread-specific data destructor.
*/
typedef void (*thread_dtor_fn_t)(void *);
diff --git a/kern/thread_i.h b/kern/thread_i.h
index 96ce100..979d4b0 100644
--- a/kern/thread_i.h
+++ b/kern/thread_i.h
@@ -24,6 +24,7 @@
#include <kern/atomic.h>
#include <kern/cpumap.h>
#include <kern/list_types.h>
+#include <kern/rcu_types.h>
#include <kern/spinlock_types.h>
#include <kern/turnstile_types.h>
#include <machine/cpu.h>
@@ -137,6 +138,8 @@ struct thread {
/* Read-side critical section level, not in any if 0 */
unsigned short llsync_level; /* (-) */
+ struct rcu_reader rcu_reader; /* (-) */
+
/* Processors on which this thread is allowed to run */
struct cpumap cpumap; /* (r) */
diff --git a/test/Kconfig b/test/Kconfig
index 8f3ae8b..6a6221b 100644
--- a/test/Kconfig
+++ b/test/Kconfig
@@ -22,6 +22,9 @@ config TEST_MODULE_MUTEX_PI
config TEST_MODULE_PMAP_UPDATE_MP
bool "pmap_update_mp"
+config TEST_MODULE_RCU_DEFER
+ bool "rcu_defer"
+
config TEST_MODULE_SREF_DIRTY_ZEROES
bool "sref_dirty_zeroes"
diff --git a/test/Makefile b/test/Makefile
index 098cecb..b7ac3d7 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -2,6 +2,7 @@ x15_SOURCES-$(CONFIG_TEST_MODULE_LLSYNC_DEFER) += test/test_llsync_defe
x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX) += test/test_mutex.c
x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX_PI) += test/test_mutex_pi.c
x15_SOURCES-$(CONFIG_TEST_MODULE_PMAP_UPDATE_MP) += test/test_pmap_update_mp.c
+x15_SOURCES-$(CONFIG_TEST_MODULE_RCU_DEFER) += test/test_rcu_defer.c
x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_DIRTY_ZEROES) += test/test_sref_dirty_zeroes.c
x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_NOREF) += test/test_sref_noref.c
x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_WEAKREF) += test/test_sref_weakref.c
diff --git a/test/test_rcu_defer.c b/test/test_rcu_defer.c
new file mode 100644
index 0000000..e1ab0e5
--- /dev/null
+++ b/test/test_rcu_defer.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2014-2018 Richard Braun.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * This test module is a stress test, expected to never terminate, of the
+ * work deferring functionality of the rcu module. It creates three
+ * threads, a producer, a consumer, and a reader. The producer allocates
+ * a page and writes it. It then transfers the page to the consumer, using
+ * the rcu interface to update the global page pointer. Once at the
+ * consumer, the rcu interface is used to defer the release of the page.
+ * Concurrently, the reader accesses the page and checks its content when
+ * available. These accesses are performed inside a read-side critical
+ * section and should therefore never fail.
+ *
+ * Each thread regularly prints a string to report that it's making progress.
+ */
+
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <kern/condition.h>
+#include <kern/cpumap.h>
+#include <kern/error.h>
+#include <kern/kmem.h>
+#include <kern/macros.h>
+#include <kern/mutex.h>
+#include <kern/panic.h>
+#include <kern/rcu.h>
+#include <kern/thread.h>
+#include <kern/work.h>
+#include <machine/page.h>
+#include <test/test.h>
+#include <vm/vm_kmem.h>
+
+#define TEST_LOOPS_PER_PRINT 100000
+
+struct test_pdsc {
+ struct work work;
+ void *addr;
+};
+
+#define TEST_VALIDATION_BYTE 0xab
+
+static struct mutex test_lock;
+static struct condition test_condition;
+static struct test_pdsc *test_pdsc;
+
+static struct kmem_cache test_pdsc_cache;
+
+static void
+test_alloc(void *arg)
+{
+ struct test_pdsc *pdsc;
+ unsigned long nr_loops;
+
+ (void)arg;
+
+ nr_loops = 0;
+
+ mutex_lock(&test_lock);
+
+ for (;;) {
+ while (test_pdsc != NULL) {
+ condition_wait(&test_condition, &test_lock);
+ }
+
+ pdsc = kmem_cache_alloc(&test_pdsc_cache);
+
+ if (pdsc != NULL) {
+ pdsc->addr = vm_kmem_alloc(PAGE_SIZE);
+
+ if (pdsc->addr != NULL) {
+ memset(pdsc->addr, TEST_VALIDATION_BYTE, PAGE_SIZE);
+ }
+ }
+
+ rcu_store_ptr(test_pdsc, pdsc);
+ condition_signal(&test_condition);
+
+ if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) {
+ printf("alloc ");
+ }
+
+ nr_loops++;
+ }
+}
+
+static void
+test_deferred_free(struct work *work)
+{
+ struct test_pdsc *pdsc;
+
+ pdsc = structof(work, struct test_pdsc, work);
+
+ if (pdsc->addr != NULL) {
+ vm_kmem_free(pdsc->addr, PAGE_SIZE);
+ }
+
+ kmem_cache_free(&test_pdsc_cache, pdsc);
+}
+
+static void
+test_free(void *arg)
+{
+ struct test_pdsc *pdsc;
+ unsigned long nr_loops;
+
+ (void)arg;
+
+ nr_loops = 0;
+
+ mutex_lock(&test_lock);
+
+ for (;;) {
+ while (test_pdsc == NULL) {
+ condition_wait(&test_condition, &test_lock);
+ }
+
+ pdsc = test_pdsc;
+ rcu_store_ptr(test_pdsc, NULL);
+
+ if (pdsc != NULL) {
+ work_init(&pdsc->work, test_deferred_free);
+ rcu_defer(&pdsc->work);
+ }
+
+ condition_signal(&test_condition);
+
+ if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) {
+ printf("free ");
+ }
+
+ nr_loops++;
+ }
+}
+
+static void
+test_read(void *arg)
+{
+ const struct test_pdsc *pdsc;
+ const unsigned char *s;
+ unsigned long nr_loops;
+
+ (void)arg;
+
+ nr_loops = 0;
+
+ for (;;) {
+ rcu_read_enter();
+
+ pdsc = rcu_load_ptr(test_pdsc);
+
+ if (pdsc != NULL) {
+ s = (const unsigned char *)pdsc->addr;
+
+ if (s != NULL) {
+ for (unsigned int i = 0; i < PAGE_SIZE; i++) {
+ if (s[i] != TEST_VALIDATION_BYTE) {
+ panic("invalid content");
+ }
+ }
+
+ if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) {
+ printf("read ");
+ }
+
+ nr_loops++;
+ }
+ }
+
+ rcu_read_leave();
+ }
+}
+
+void
+test_setup(void)
+{
+ struct thread_attr attr;
+ struct thread *thread;
+ struct cpumap *cpumap;
+ int error;
+
+ condition_init(&test_condition);
+ mutex_init(&test_lock);
+
+ error = cpumap_create(&cpumap);
+ error_check(error, "cpumap_create");
+
+ kmem_cache_init(&test_pdsc_cache, "test_pdsc",
+ sizeof(struct test_pdsc), 0, NULL, 0);
+
+ thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_alloc");
+ thread_attr_set_detached(&attr);
+ cpumap_zero(cpumap);
+ cpumap_set(cpumap, cpu_count() - 1);
+ thread_attr_set_cpumap(&attr, cpumap);
+ error = thread_create(&thread, &attr, test_alloc, NULL);
+ error_check(error, "thread_create");
+
+ thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_free");
+ thread_attr_set_detached(&attr);
+ cpumap_zero(cpumap);
+ cpumap_set(cpumap, cpu_count() - 1);
+ thread_attr_set_cpumap(&attr, cpumap);
+ error = thread_create(&thread, &attr, test_free, NULL);
+ error_check(error, "thread_create");
+
+ thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_read");
+ thread_attr_set_detached(&attr);
+ cpumap_zero(cpumap);
+ cpumap_set(cpumap, 0);
+ thread_attr_set_cpumap(&attr, cpumap);
+ error = thread_create(&thread, &attr, test_read, NULL);
+ error_check(error, "thread_create");
+
+ cpumap_destroy(cpumap);
+}