diff options
author | Richard Braun <rbraun@sceen.net> | 2018-02-20 20:46:37 +0100 |
---|---|---|
committer | Richard Braun <rbraun@sceen.net> | 2018-02-20 20:46:37 +0100 |
commit | da8eb9c244d27fd042adde6234ccec079681d7f4 (patch) | |
tree | f0de4b67e3a59dded6598efe3db87c835da72b92 | |
parent | ed9ada07d9c841b3f3f3a3f841aae2792a30779a (diff) | |
parent | 22dff6b7a6839e77d713d671cb2038e56b64ac16 (diff) |
Merge branch 'preemptible_rcu'
-rw-r--r-- | kern/Makefile | 1 | ||||
-rw-r--r-- | kern/clock.c | 2 | ||||
-rw-r--r-- | kern/rcu.c | 772 | ||||
-rw-r--r-- | kern/rcu.h | 146 | ||||
-rw-r--r-- | kern/rcu_i.h | 61 | ||||
-rw-r--r-- | kern/rcu_types.h | 40 | ||||
-rw-r--r-- | kern/thread.c | 16 | ||||
-rw-r--r-- | kern/thread.h | 10 | ||||
-rw-r--r-- | kern/thread_i.h | 3 | ||||
-rw-r--r-- | test/Kconfig | 3 | ||||
-rw-r--r-- | test/Makefile | 1 | ||||
-rw-r--r-- | test/test_rcu_defer.c | 231 |
12 files changed, 1275 insertions, 11 deletions
diff --git a/kern/Makefile b/kern/Makefile index 0aa96fc..6c0778b 100644 --- a/kern/Makefile +++ b/kern/Makefile @@ -20,6 +20,7 @@ x15_SOURCES-y += \ kern/plist.c \ kern/printf.c \ kern/rbtree.c \ + kern/rcu.c \ kern/rdxtree.c \ kern/rtmutex.c \ kern/semaphore.c \ diff --git a/kern/clock.c b/kern/clock.c index 19f823c..c09ffdd 100644 --- a/kern/clock.c +++ b/kern/clock.c @@ -24,6 +24,7 @@ #include <kern/init.h> #include <kern/llsync.h> #include <kern/percpu.h> +#include <kern/rcu.h> #include <kern/sref.h> #include <kern/syscnt.h> #include <kern/thread.h> @@ -91,6 +92,7 @@ void clock_tick_intr(void) timer_report_periodic_event(); llsync_report_periodic_event(); + rcu_report_periodic_event(); sref_report_periodic_event(); work_report_periodic_event(); thread_report_periodic_event(); diff --git a/kern/rcu.c b/kern/rcu.c new file mode 100644 index 0000000..82c62df --- /dev/null +++ b/kern/rcu.c @@ -0,0 +1,772 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This implementation is based on the paper "Extending RCU for Realtime + * and Embedded Workloads" by Paul E. McKenney, Ingo Molnar, Dipankar Sarma, + * and Suparna Bhattacharya. Beside the mechanisms not implemented yet, + * such as priority boosting, the differences are described below. 
+ * + * First, this implementation uses scalable reference counters provided + * by the sref module instead of per-CPU counters as described in the paper. + * The main benefit of this approach is the centralization of most scalability + * improvements in the sref module, which should propagate to all sref users, + * including RCU. + * + * In addition, this implementation introduces the concept of windows, where + * a window is a range in time to which readers may be linked. Here, a + * grace period is defined as the time range at the end of a window where + * various synchronization steps are performed to enforce the RCU guarantees. + * The minimum duration of a window acts as a knob allowing users to tune + * the behavior of the RCU system. + * + * Finally, the state machine described in the paper is updated to accommodate + * windows, since grace periods don't run back-to-back to each other. + * Windows are regularly checked and flipped if the previous one isn't + * active any more. From that moment, processors may notice the global flip + * and perform a local flip of their work window ID. Once all processors + * have acknowledged the flip, it is certain that no new work may be queued + * on the previous window. At this point, the same occurs for the + * processor-local reader window ID, and once all processors have + * acknowledged that flip, there can be no new reader linked to the previous + * window. The RCU system then releases its own reference to the previous + * window and waits for the window reference counter to drop to 0, indicating + * that all readers linked to the previous window have left their read-side + * critical section. When this global event occurs, processors are requested + * to flush the works queued for the previous window, and once they all have + * acknowledged their flush, the window ends and becomes inactive, allowing + * a new grace period to occur later on. 
+ * + * Here is an informal diagram describing this process : + * + * t ----> + * + * reader window flip ---+ +--- no more readers + * work window flip ------+ | | +- works flushed + * (grace period start) | | | | (grace period / window end) + * v v v v + * +--------------+-+-----+-+ + * | . . . | + * | window 0 . . gp . | + * | removal . . . | reclamation + * +--------------+-+-----+-+-----+----+ + * | . | + * | window 1 . gp | + * | removal . | reclamation + * +---------------+----+-------- + * | + * | window 2 ... + * | + * +------------- + * + * TODO Improve atomic acknowledgment scalability. + * TODO Handle large amounts of deferred works. + * TODO Priority boosting of slow readers. + * TODO CPU registration for dyntick-friendly behavior. + */ + +#include <assert.h> +#include <stdalign.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> + +#include <kern/atomic.h> +#include <kern/clock.h> +#include <kern/init.h> +#include <kern/macros.h> +#include <kern/rcu.h> +#include <kern/panic.h> +#include <kern/percpu.h> +#include <kern/spinlock.h> +#include <kern/sref.h> +#include <kern/syscnt.h> +#include <kern/thread.h> +#include <kern/timer.h> +#include <kern/work.h> +#include <machine/cpu.h> + +/* + * Negative close to 0 so that an overflow occurs early. + */ +#define RCU_WINDOW_ID_INIT_VALUE ((unsigned int)-500) + +/* + * Interval between window checking. + * + * When windows are checked, a flip occurs if the previous window isn't + * active any more. + */ +#define RCU_WINDOW_CHECK_INTERVAL_MS 10 + +/* + * Grace period states. + * + * These states are only used to trigger per-CPU processing that is + * globally acknowleged by decrementing a global atomic counter. They + * do not completely represent the actual state of a grace period. + */ +enum rcu_gp_state { + RCU_GP_STATE_WORK_WINDOW_FLIP, + RCU_GP_STATE_READER_WINDOW_FLIP, + RCU_GP_STATE_WORK_FLUSH, +}; + +/* + * Per-CPU view of a window. + * + * Deferred works are scheduled when the window ends. 
+ */ +struct rcu_cpu_window { + struct work_queue works; +}; + +/* + * Per-CPU RCU data. + * + * Each processor maintains two local window IDs. One is used as the current + * window ID when deferring work, the other when detecting a reader. A local + * flip occurs when a processor notices that the global grace period state + * no longer matches the local grace period state. These checks only occur + * on periodic events. + * + * Interrupts and preemption must be disabled when accessing local CPU data. + */ +struct rcu_cpu_data { + enum rcu_gp_state gp_state; + unsigned int work_wid; + unsigned int reader_wid; + struct rcu_cpu_window windows[2]; + struct syscnt sc_nr_detected_readers; +}; + +/* + * Global window. + * + * A window is a time range that tracks read-side references. Conceptually, + * each reader adds a reference to the current window. In practice, references + * are only added when readers are detected, which occurs on a context switch + * (to track preempted threads) or a reader window flip (to prevent currently + * running readers to be linked to the next window). + * + * When a window is started, its scalable reference counter is initialized + * with a reference owned by the RCU system. That reference guarantees that + * the window remains active as long as new readers may add references, + * since it prevents the counter from dropping to 0. After a reader window + * flip, there may not be new references to the window, and the initial + * reference is dropped, allowing the counter to reach 0 once all detected + * readers leave their critical section and unreference the window they're + * linked to. + */ +struct rcu_window { + struct sref_counter nr_refs; + uint64_t start_ts; + bool active; +}; + +/* + * Global data. + * + * Processors regularly check the grace period state against their own, + * locally cached grace period state, and take action whenever they differ. 
+ * False sharing is avoided by making the global grace period state fill an + * entire cache line on SMP. + * + * After processors notice a grace period state change, they acknowledge + * noticing this change by decrementing the atomic acknowledgment counter, + * which also fills a complete cache line on SMP in order to restrict cache + * line bouncing. Atomic operations on this counter are done with + * acquire-release ordering to enforce the memory ordering guarantees + * required by the implementation, as well as those provided by the public + * interface. + * + * In addition to the global window ID and the windows themselves, the data + * include a timer, used to trigger the end of windows, i.e. grace periods. + * Since the timer function, atomic acknowledgments, and window no-reference + * function chain each other, there is currently no need for a global lock. + */ +struct rcu_data { + struct { + alignas(CPU_L1_SIZE) enum rcu_gp_state gp_state; + }; + struct { + alignas(CPU_L1_SIZE) unsigned int nr_acks; + }; + + unsigned int wid; + struct rcu_window windows[2]; + struct timer timer; + struct syscnt sc_nr_windows; + struct syscnt sc_last_window_ms; + struct syscnt sc_longest_window_ms; +}; + +/* + * Structure used to implement rcu_wait(). 
+ */ +struct rcu_waiter { + struct work work; + struct spinlock lock; + struct thread *thread; + bool done; +}; + +static struct rcu_data rcu_data; +static struct rcu_cpu_data rcu_cpu_data __percpu; + +static struct rcu_cpu_data * +rcu_get_cpu_data(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + return cpu_local_ptr(rcu_cpu_data); +} + +static enum rcu_gp_state +rcu_data_get_gp_state(const struct rcu_data *data) +{ + return data->gp_state; +} + +static unsigned int +rcu_data_get_wid(const struct rcu_data *data) +{ + return data->wid; +} + +static struct rcu_window * +rcu_data_get_window_from_index(struct rcu_data *data, size_t index) +{ + assert(index < ARRAY_SIZE(data->windows)); + return &data->windows[index]; +} + +static struct rcu_window * +rcu_data_get_window(struct rcu_data *data, unsigned int wid) +{ + return rcu_data_get_window_from_index(data, wid & 1); +} + +static void +rcu_data_update_gp_state(struct rcu_data *data, enum rcu_gp_state gp_state) +{ + assert(data->nr_acks == 0); + + switch (gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_FLUSH); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_WINDOW_FLIP); + break; + case RCU_GP_STATE_WORK_FLUSH: + assert(data->gp_state == RCU_GP_STATE_READER_WINDOW_FLIP); + break; + default: + panic("rcu: invalid grace period state"); + } + + data->nr_acks = cpu_count(); + atomic_store(&data->gp_state, gp_state, ATOMIC_RELEASE); +} + +static bool +rcu_data_check_gp_state(const struct rcu_data *data, + enum rcu_gp_state local_gp_state, + enum rcu_gp_state *global_gp_state) +{ + *global_gp_state = atomic_load(&data->gp_state, ATOMIC_RELAXED); + + if (unlikely(local_gp_state != *global_gp_state)) { + atomic_fence_acquire(); + return true; + } + + return false; +} + +static void +rcu_window_end(struct rcu_window *window) +{ + assert(window->active); + window->active = false; +} + +static void 
+rcu_window_ref(struct rcu_window *window) +{ + sref_counter_inc(&window->nr_refs); +} + +static void +rcu_window_unref(struct rcu_window *window) +{ + sref_counter_dec(&window->nr_refs); +} + +static uint64_t +rcu_window_get_start_ts(const struct rcu_window *window) +{ + return window->start_ts; +} + +static void +rcu_window_flush(struct sref_counter *counter) +{ + (void)counter; + + rcu_data_update_gp_state(&rcu_data, RCU_GP_STATE_WORK_FLUSH); +} + +static void __init +rcu_window_init(struct rcu_window *window) +{ + window->active = false; +} + +static void +rcu_window_start(struct rcu_window *window) +{ + assert(!window->active); + + sref_counter_init(&window->nr_refs, 1, NULL, rcu_window_flush); + window->start_ts = clock_get_time(); + window->active = true; +} + +static bool +rcu_window_active(const struct rcu_window *window) +{ + return window->active; +} + +static void +rcu_data_end_prev_window(struct rcu_data *data, uint64_t now) +{ + struct rcu_window *window; + uint64_t duration; + + window = rcu_data_get_window(data, data->wid - 1); + duration = clock_ticks_to_ms(now - rcu_window_get_start_ts(window)); + syscnt_set(&data->sc_last_window_ms, duration); + + if (duration > syscnt_read(&data->sc_longest_window_ms)) { + syscnt_set(&data->sc_longest_window_ms, duration); + } + + rcu_window_end(window); +} + +static void +rcu_data_schedule_timer(struct rcu_data *data, uint64_t now) +{ + uint64_t ticks; + + ticks = clock_ticks_from_ms(RCU_WINDOW_CHECK_INTERVAL_MS); + timer_schedule(&data->timer, now + ticks); +} + +static void +rcu_data_ack_cpu(struct rcu_data *data) +{ + struct rcu_window *window; + unsigned int prev_nr_acks; + uint64_t now; + + prev_nr_acks = atomic_fetch_sub(&data->nr_acks, 1, ATOMIC_ACQ_REL); + + if (prev_nr_acks != 1) { + assert(prev_nr_acks != 0); + return; + } + + switch (data->gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_data_update_gp_state(data, RCU_GP_STATE_READER_WINDOW_FLIP); + break; + case 
RCU_GP_STATE_READER_WINDOW_FLIP: + window = rcu_data_get_window(data, data->wid - 1); + rcu_window_unref(window); + break; + case RCU_GP_STATE_WORK_FLUSH: + now = clock_get_time(); + rcu_data_end_prev_window(data, now); + rcu_data_schedule_timer(data, now); + break; + default: + panic("rcu: invalid grace period state"); + } +} + +static bool +rcu_data_flip_windows(struct rcu_data *data) +{ + struct rcu_window *window; + + window = rcu_data_get_window(data, data->wid - 1); + + if (rcu_window_active(window)) { + return false; + } + + rcu_window_start(window); + syscnt_inc(&data->sc_nr_windows); + data->wid++; + rcu_data_update_gp_state(data, RCU_GP_STATE_WORK_WINDOW_FLIP); + return true; +} + +static void +rcu_data_check_windows(struct timer *timer) +{ + struct rcu_data *data; + bool flipped; + + data = &rcu_data; + flipped = rcu_data_flip_windows(data); + + if (!flipped) { + rcu_data_schedule_timer(data, timer_get_time(timer)); + } +} + +static void __init +rcu_data_init(struct rcu_data *data) +{ + data->gp_state = RCU_GP_STATE_WORK_FLUSH; + data->nr_acks = 0; + data->wid = RCU_WINDOW_ID_INIT_VALUE; + + for (size_t i = 0; i < ARRAY_SIZE(data->windows); i++) { + rcu_window_init(rcu_data_get_window_from_index(data, i)); + } + + rcu_window_start(rcu_data_get_window(data, data->wid)); + + timer_init(&data->timer, rcu_data_check_windows, 0); + rcu_data_schedule_timer(data, clock_get_time()); + + syscnt_register(&data->sc_nr_windows, "rcu_nr_windows"); + syscnt_register(&data->sc_last_window_ms, "rcu_last_window_ms"); + syscnt_register(&data->sc_longest_window_ms, "rcu_longest_window_ms"); +} + +static void __init +rcu_cpu_window_init(struct rcu_cpu_window *cpu_window) +{ + work_queue_init(&cpu_window->works); +} + +static void +rcu_cpu_window_queue(struct rcu_cpu_window *cpu_window, struct work *work) +{ + work_queue_push(&cpu_window->works, work); +} + +static void +rcu_cpu_window_flush(struct rcu_cpu_window *cpu_window) +{ + work_queue_schedule(&cpu_window->works, 0); 
+ work_queue_init(&cpu_window->works); +} + +static unsigned int +rcu_cpu_data_get_reader_wid(const struct rcu_cpu_data *cpu_data) +{ + return cpu_data->reader_wid; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window_from_index(struct rcu_cpu_data *cpu_data, size_t index) +{ + assert(index < ARRAY_SIZE(cpu_data->windows)); + return &cpu_data->windows[index]; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window(struct rcu_cpu_data *cpu_data, unsigned int wid) +{ + return rcu_cpu_data_get_window_from_index(cpu_data, wid & 1); +} + +static void __init +rcu_cpu_data_init(struct rcu_cpu_data *cpu_data, unsigned int cpu) +{ + struct rcu_data *data; + char name[SYSCNT_NAME_SIZE]; + + data = &rcu_data; + + cpu_data->gp_state = rcu_data_get_gp_state(data); + cpu_data->work_wid = rcu_data_get_wid(data); + cpu_data->reader_wid = cpu_data->work_wid; + + for (size_t i = 0; i < ARRAY_SIZE(cpu_data->windows); i++) { + rcu_cpu_window_init(rcu_cpu_data_get_window_from_index(cpu_data, i)); + } + + snprintf(name, sizeof(name), "rcu_nr_detected_readers/%u", cpu); + syscnt_register(&cpu_data->sc_nr_detected_readers, name); +} + +static void +rcu_cpu_data_queue(struct rcu_cpu_data *cpu_data, struct work *work) +{ + struct rcu_cpu_window *cpu_window; + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid); + rcu_cpu_window_queue(cpu_window, work); +} + +static void +rcu_cpu_data_flush(struct rcu_cpu_data *cpu_data) +{ + struct rcu_cpu_window *cpu_window; + + assert(cpu_data->work_wid == cpu_data->reader_wid); + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid - 1); + rcu_cpu_window_flush(cpu_window); +} + +void +rcu_reader_init(struct rcu_reader *reader) +{ + reader->level = 0; + reader->linked = false; +} + +static void +rcu_reader_link(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(reader == thread_rcu_reader(thread_self())); + assert(!rcu_reader_linked(reader)); + + reader->wid = 
rcu_cpu_data_get_reader_wid(cpu_data); + reader->linked = true; +} + +static void +rcu_reader_unlink(struct rcu_reader *reader) +{ + assert(reader->level == 0); + reader->linked = false; +} + +static void +rcu_reader_enter(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + struct rcu_window *window; + struct rcu_data *data; + unsigned int wid; + + if (rcu_reader_linked(reader)) { + return; + } + + data = &rcu_data; + wid = rcu_cpu_data_get_reader_wid(cpu_data); + window = rcu_data_get_window(data, wid); + + rcu_reader_link(reader, cpu_data); + rcu_window_ref(window); + + syscnt_inc(&cpu_data->sc_nr_detected_readers); +} + +void +rcu_reader_leave(struct rcu_reader *reader) +{ + struct rcu_window *window; + struct rcu_data *data; + + data = &rcu_data; + + window = rcu_data_get_window(data, reader->wid); + rcu_window_unref(window); + rcu_reader_unlink(reader); +} + +static void +rcu_reader_account(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + if (rcu_reader_in_cs(reader)) { + rcu_reader_enter(reader, cpu_data); + } +} + +static void +rcu_cpu_data_flip_work_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + cpu_data->work_wid++; +} + +static void +rcu_cpu_data_flip_reader_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_reader_account(thread_rcu_reader(thread_self()), cpu_data); + cpu_data->reader_wid++; +} + +static void +rcu_cpu_data_check_gp_state(struct rcu_cpu_data *cpu_data) +{ + enum rcu_gp_state local_gp_state, global_gp_state; + struct rcu_data *data; + bool diff; + + data = &rcu_data; + + for (;;) { + local_gp_state = cpu_data->gp_state; + diff = rcu_data_check_gp_state(data, local_gp_state, &global_gp_state); + + if (!diff) { + break; + } + + switch (global_gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_cpu_data_flip_work_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case 
RCU_GP_STATE_READER_WINDOW_FLIP: + rcu_cpu_data_flip_reader_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case RCU_GP_STATE_WORK_FLUSH: + rcu_cpu_data_flush(cpu_data); + rcu_data_ack_cpu(data); + break; + default: + panic("rcu: invalid grace period state"); + } + + cpu_data->gp_state = global_gp_state; + } +} + +void +rcu_report_context_switch(struct rcu_reader *reader) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + /* + * Most readers don't need to be accounted for because their execution + * doesn't overlap with a grace period. If a reader is preempted however, + * it must be accounted in case a grace period starts while the reader + * is preempted. Accounting also occurs when a grace period starts, and + * more exactly, when the reader window ID of a processor is flipped. + */ + rcu_reader_account(reader, rcu_get_cpu_data()); +} + +void +rcu_report_periodic_event(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_cpu_data_check_gp_state(rcu_get_cpu_data()); +} + +void +rcu_defer(struct work *work) +{ + struct rcu_cpu_data *cpu_data; + unsigned long flags; + + thread_preempt_disable_intr_save(&flags); + cpu_data = rcu_get_cpu_data(); + rcu_cpu_data_queue(cpu_data, work); + thread_preempt_enable_intr_restore(flags); +} + +static void +rcu_waiter_wakeup(struct work *work) +{ + struct rcu_waiter *waiter; + + waiter = structof(work, struct rcu_waiter, work); + + spinlock_lock(&waiter->lock); + waiter->done = true; + thread_wakeup(waiter->thread); + spinlock_unlock(&waiter->lock); +} + +static void +rcu_waiter_init(struct rcu_waiter *waiter, struct thread *thread) +{ + work_init(&waiter->work, rcu_waiter_wakeup); + spinlock_init(&waiter->lock); + waiter->thread = thread; + waiter->done = false; +} + +static void +rcu_waiter_wait(struct rcu_waiter *waiter) +{ + rcu_defer(&waiter->work); + + spinlock_lock(&waiter->lock); + + while (!waiter->done) { + thread_sleep(&waiter->lock, waiter, "rcu_wait"); + } 
+ + spinlock_unlock(&waiter->lock); +} + +void +rcu_wait(void) +{ + struct rcu_waiter waiter; + + rcu_waiter_init(&waiter, thread_self()), + rcu_waiter_wait(&waiter); +} + +static int __init +rcu_setup(void) +{ + rcu_data_init(&rcu_data); + + for (size_t i = 0; i < cpu_count(); i++) { + rcu_cpu_data_init(percpu_ptr(rcu_cpu_data, i), i); + } + + return 0; +} + +INIT_OP_DEFINE(rcu_setup, + INIT_OP_DEP(clock_setup, true), + INIT_OP_DEP(cpu_mp_probe, true), + INIT_OP_DEP(spinlock_setup, true), + INIT_OP_DEP(sref_setup, true), + INIT_OP_DEP(syscnt_setup, true), + INIT_OP_DEP(thread_bootstrap, true), + INIT_OP_DEP(timer_setup, true), + INIT_OP_DEP(work_setup, true)); diff --git a/kern/rcu.h b/kern/rcu.h new file mode 100644 index 0000000..1ddae7d --- /dev/null +++ b/kern/rcu.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Read-Copy Update. + * + * This module provides a synchronization framework for read-only lockless + * operations, best used on read-mostly data structures. + * + * To learn more about RCU, see "Is Parallel Programming Hard, And, If So, + * What Can You Do About It ?" by Paul E. McKenney. + * + * A thorough discussion of RCU's requirements can be found at [1]. 
+ * However, the memory model and terminology used in that document being + * different from C11, the memory barrier guarantees are hereby translated + * into memory ordering guarantees : + * + * 1. Release-acquire ordering is enforced between the end of all + * read-side critical sections starting before a grace period starts + * and the start of all works deferred before the same grace period + * starts. + * 2. Release-acquire ordering is enforced between the start of all + * read-side critical sections ending after a grace period ends + * and all the work deferrals done before the same grace period + * starts. + * 3. Release-acquire ordering is enforced between all the work deferrals + * done before a grace period starts and the start of all works deferred + * before the same grace period starts. + * + * [1] https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.html. + */ + +#ifndef _KERN_RCU_H +#define _KERN_RCU_H + +#include <kern/atomic.h> +#include <kern/init.h> +#include <kern/rcu_i.h> +#include <kern/rcu_types.h> +#include <kern/thread.h> +#include <kern/work.h> + +/* + * Thread-local RCU data. + */ +struct rcu_reader; + +/* + * Safely store a pointer. + * + * This macro enforces release ordering on the given pointer. + */ +#define rcu_store_ptr(ptr, value) atomic_store(&(ptr), value, ATOMIC_RELEASE) + +/* + * Safely load a pointer. + * + * This macro enforces consume ordering on the given pointer. + */ +#define rcu_load_ptr(ptr) atomic_load(&(ptr), ATOMIC_CONSUME) + +/* + * Read-side critical section functions. + * + * Critical sections are preemptible, may safely nest, and may safely + * be used in interrupt context. It is not allowed to sleep inside a + * read-side critical section. However, it is allowed to acquire locks + * that don't sleep, such as spin locks. + * + * Entering or leaving a critical section doesn't provide any memory + * ordering guarantee. 
+ */ + +static inline void +rcu_read_enter(void) +{ + rcu_reader_inc(thread_rcu_reader(thread_self())); + barrier(); +} + +static inline void +rcu_read_leave(void) +{ + barrier(); + rcu_reader_dec(thread_rcu_reader(thread_self())); +} + +/* + * Initialize a RCU reader. + */ +void rcu_reader_init(struct rcu_reader *reader); + +/* + * Report a context switch on the current processor. + * + * The argument is the RCU reader of the preempted thread. + * + * Interrupts and preemption must be disabled when calling this function. + */ +void rcu_report_context_switch(struct rcu_reader *reader); + +/* + * Report a periodic event on the current processor. + * + * Interrupts and preemption must be disabled when calling this function. + */ +void rcu_report_periodic_event(void); + +/* + * Defer a work until all existing read-side references are dropped, + * without blocking. + * + * The given work is scheduled when all existing read-side references + * are dropped. + */ +void rcu_defer(struct work *work); + +/* + * Wait for all existing read-side references to be dropped. + * + * This function sleeps, and may do so for a moderately long duration, + * at leat a few system timer ticks, sometimes a lot more. + */ +void rcu_wait(void); + +/* + * This init operation provides : + * - read-side critical sections may be used + * - module fully initialized + */ +INIT_OP_DECLARE(rcu_setup); + +#endif /* _KERN_RCU_H */ diff --git a/kern/rcu_i.h b/kern/rcu_i.h new file mode 100644 index 0000000..6be933d --- /dev/null +++ b/kern/rcu_i.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _KERN_RCU_I_H +#define _KERN_RCU_I_H + +#include <assert.h> +#include <stdbool.h> + +#include <kern/macros.h> +#include <kern/rcu_types.h> +#include <kern/thread.h> + +void rcu_reader_leave(struct rcu_reader *reader); + +static inline bool +rcu_reader_in_cs(const struct rcu_reader *reader) +{ + return reader->level != 0; +} + +static inline bool +rcu_reader_linked(const struct rcu_reader *reader) +{ + assert(reader == thread_rcu_reader(thread_self())); + return reader->linked; +} + +static inline void +rcu_reader_inc(struct rcu_reader *reader) +{ + reader->level++; + assert(reader->level != 0); +} + +static inline void +rcu_reader_dec(struct rcu_reader *reader) +{ + assert(reader->level != 0); + reader->level--; + + if (unlikely(!rcu_reader_in_cs(reader) && rcu_reader_linked(reader))) { + rcu_reader_leave(reader); + } +} + +#endif /* _KERN_RCU_I_H */ diff --git a/kern/rcu_types.h b/kern/rcu_types.h new file mode 100644 index 0000000..12b8913 --- /dev/null +++ b/kern/rcu_types.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Isolated type definition used to avoid inclusion circular dependencies. + */ + +#ifndef _KERN_RCU_TYPES_H +#define _KERN_RCU_TYPES_H + +#include <stdbool.h> + +/* + * Thread-local data used to track threads running read-side critical + * sections. + * + * The window ID is valid if and only if the reader is linked. + * + * Interrupts and preemption must be disabled when accessing a reader. + */ +struct rcu_reader { + unsigned int level; + unsigned int wid; + bool linked; +}; + +#endif /* _KERN_RCU_TYPES_H */ diff --git a/kern/thread.c b/kern/thread.c index 20378ff..8460f3b 100644 --- a/kern/thread.c +++ b/kern/thread.c @@ -101,6 +101,7 @@ #include <kern/macros.h> #include <kern/panic.h> #include <kern/percpu.h> +#include <kern/rcu.h> #include <kern/shell.h> #include <kern/sleepq.h> #include <kern/spinlock.h> @@ -641,6 +642,8 @@ thread_runq_schedule(struct thread_runq *runq) assert(next->preempt_level == THREAD_SUSPEND_PREEMPT_LEVEL); if (likely(prev != next)) { + rcu_report_context_switch(thread_rcu_reader(prev)); + /* * That's where the true context switch occurs. The next thread must * unlock the run queue and reenable preemption. 
Note that unlocking @@ -1700,6 +1703,7 @@ thread_init_booter(unsigned int cpu) booter->flags = 0; booter->intr_level = 0; booter->preempt_level = 1; + rcu_reader_init(&booter->rcu_reader); cpumap_fill(&booter->cpumap); thread_set_user_sched_policy(booter, THREAD_SCHED_POLICY_IDLE); thread_set_user_sched_class(booter, THREAD_SCHED_CLASS_IDLE); @@ -1828,6 +1832,7 @@ thread_init(struct thread *thread, void *stack, thread->pin_level = 0; thread->intr_level = 0; thread->llsync_level = 0; + rcu_reader_init(&thread->rcu_reader); cpumap_copy(&thread->cpumap, cpumap); thread_set_user_sched_policy(thread, attr->policy); thread_set_user_sched_class(thread, thread_policy_to_class(attr->policy)); @@ -2108,7 +2113,6 @@ static void thread_idle(void *arg) { struct thread *self; - int error; (void)arg; @@ -2116,13 +2120,6 @@ thread_idle(void *arg) for (;;) { thread_preempt_disable(); - error = sref_unregister(); - - if (error) { - assert(error == ERROR_BUSY); - goto error_sref; - } - llsync_unregister(); for (;;) { @@ -2137,9 +2134,6 @@ thread_idle(void *arg) } llsync_register(); - sref_register(); - -error_sref: thread_preempt_enable(); } } diff --git a/kern/thread.h b/kern/thread.h index 58322bd..c4dbd5d 100644 --- a/kern/thread.h +++ b/kern/thread.h @@ -679,6 +679,16 @@ thread_llsync_read_dec(void) } /* + * RCU functions. + */ + +static inline struct rcu_reader * +thread_rcu_reader(struct thread *thread) +{ + return &thread->rcu_reader; +} + +/* * Type for thread-specific data destructor. 
*/ typedef void (*thread_dtor_fn_t)(void *); diff --git a/kern/thread_i.h b/kern/thread_i.h index 96ce100..979d4b0 100644 --- a/kern/thread_i.h +++ b/kern/thread_i.h @@ -24,6 +24,7 @@ #include <kern/atomic.h> #include <kern/cpumap.h> #include <kern/list_types.h> +#include <kern/rcu_types.h> #include <kern/spinlock_types.h> #include <kern/turnstile_types.h> #include <machine/cpu.h> @@ -137,6 +138,8 @@ struct thread { /* Read-side critical section level, not in any if 0 */ unsigned short llsync_level; /* (-) */ + struct rcu_reader rcu_reader; /* (-) */ + /* Processors on which this thread is allowed to run */ struct cpumap cpumap; /* (r) */ diff --git a/test/Kconfig b/test/Kconfig index 8f3ae8b..6a6221b 100644 --- a/test/Kconfig +++ b/test/Kconfig @@ -22,6 +22,9 @@ config TEST_MODULE_MUTEX_PI config TEST_MODULE_PMAP_UPDATE_MP bool "pmap_update_mp" +config TEST_MODULE_RCU_DEFER + bool "rcu_defer" + config TEST_MODULE_SREF_DIRTY_ZEROES bool "sref_dirty_zeroes" diff --git a/test/Makefile b/test/Makefile index 098cecb..b7ac3d7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -2,6 +2,7 @@ x15_SOURCES-$(CONFIG_TEST_MODULE_LLSYNC_DEFER) += test/test_llsync_defe x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX) += test/test_mutex.c x15_SOURCES-$(CONFIG_TEST_MODULE_MUTEX_PI) += test/test_mutex_pi.c x15_SOURCES-$(CONFIG_TEST_MODULE_PMAP_UPDATE_MP) += test/test_pmap_update_mp.c +x15_SOURCES-$(CONFIG_TEST_MODULE_RCU_DEFER) += test/test_rcu_defer.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_DIRTY_ZEROES) += test/test_sref_dirty_zeroes.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_NOREF) += test/test_sref_noref.c x15_SOURCES-$(CONFIG_TEST_MODULE_SREF_WEAKREF) += test/test_sref_weakref.c diff --git a/test/test_rcu_defer.c b/test/test_rcu_defer.c new file mode 100644 index 0000000..e1ab0e5 --- /dev/null +++ b/test/test_rcu_defer.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2014-2018 Richard Braun. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This test module is a stress test, expected to never terminate, of the + * work deferring functionality of the rcu module. It creates three + * threads, a producer, a consumer, and a reader. The producer allocates + * a page and writes it. It then transfers the page to the consumer, using + * the rcu interface to update the global page pointer. Once at the + * consumer, the rcu interface is used to defer the release of the page. + * Concurrently, the reader accesses the page and checks its content when + * available. These accesses are performed inside a read-side critical + * section and should therefore never fail. + * + * Each thread regularly prints a string to report that it's making progress. 
+ */ + +#include <stddef.h> +#include <stdio.h> +#include <string.h> + +#include <kern/condition.h> +#include <kern/cpumap.h> +#include <kern/error.h> +#include <kern/kmem.h> +#include <kern/macros.h> +#include <kern/mutex.h> +#include <kern/panic.h> +#include <kern/rcu.h> +#include <kern/thread.h> +#include <kern/work.h> +#include <machine/page.h> +#include <test/test.h> +#include <vm/vm_kmem.h> + +#define TEST_LOOPS_PER_PRINT 100000 + +struct test_pdsc { + struct work work; + void *addr; +}; + +#define TEST_VALIDATION_BYTE 0xab + +static struct mutex test_lock; +static struct condition test_condition; +static struct test_pdsc *test_pdsc; + +static struct kmem_cache test_pdsc_cache; + +static void +test_alloc(void *arg) +{ + struct test_pdsc *pdsc; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + mutex_lock(&test_lock); + + for (;;) { + while (test_pdsc != NULL) { + condition_wait(&test_condition, &test_lock); + } + + pdsc = kmem_cache_alloc(&test_pdsc_cache); + + if (pdsc != NULL) { + pdsc->addr = vm_kmem_alloc(PAGE_SIZE); + + if (pdsc->addr != NULL) { + memset(pdsc->addr, TEST_VALIDATION_BYTE, PAGE_SIZE); + } + } + + rcu_store_ptr(test_pdsc, pdsc); + condition_signal(&test_condition); + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("alloc "); + } + + nr_loops++; + } +} + +static void +test_deferred_free(struct work *work) +{ + struct test_pdsc *pdsc; + + pdsc = structof(work, struct test_pdsc, work); + + if (pdsc->addr != NULL) { + vm_kmem_free(pdsc->addr, PAGE_SIZE); + } + + kmem_cache_free(&test_pdsc_cache, pdsc); +} + +static void +test_free(void *arg) +{ + struct test_pdsc *pdsc; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + mutex_lock(&test_lock); + + for (;;) { + while (test_pdsc == NULL) { + condition_wait(&test_condition, &test_lock); + } + + pdsc = test_pdsc; + rcu_store_ptr(test_pdsc, NULL); + + if (pdsc != NULL) { + work_init(&pdsc->work, test_deferred_free); + rcu_defer(&pdsc->work); + } + + 
condition_signal(&test_condition); + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("free "); + } + + nr_loops++; + } +} + +static void +test_read(void *arg) +{ + const struct test_pdsc *pdsc; + const unsigned char *s; + unsigned long nr_loops; + + (void)arg; + + nr_loops = 0; + + for (;;) { + rcu_read_enter(); + + pdsc = rcu_load_ptr(test_pdsc); + + if (pdsc != NULL) { + s = (const unsigned char *)pdsc->addr; + + if (s != NULL) { + for (unsigned int i = 0; i < PAGE_SIZE; i++) { + if (s[i] != TEST_VALIDATION_BYTE) { + panic("invalid content"); + } + } + + if ((nr_loops % TEST_LOOPS_PER_PRINT) == 0) { + printf("read "); + } + + nr_loops++; + } + } + + rcu_read_leave(); + } +} + +void +test_setup(void) +{ + struct thread_attr attr; + struct thread *thread; + struct cpumap *cpumap; + int error; + + condition_init(&test_condition); + mutex_init(&test_lock); + + error = cpumap_create(&cpumap); + error_check(error, "cpumap_create"); + + kmem_cache_init(&test_pdsc_cache, "test_pdsc", + sizeof(struct test_pdsc), 0, NULL, 0); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_alloc"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, cpu_count() - 1); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_alloc, NULL); + error_check(error, "thread_create"); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_free"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, cpu_count() - 1); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_free, NULL); + error_check(error, "thread_create"); + + thread_attr_init(&attr, THREAD_KERNEL_PREFIX "test_read"); + thread_attr_set_detached(&attr); + cpumap_zero(cpumap); + cpumap_set(cpumap, 0); + thread_attr_set_cpumap(&attr, cpumap); + error = thread_create(&thread, &attr, test_read, NULL); + error_check(error, "thread_create"); + + cpumap_destroy(cpumap); +} |