diff options
author | Richard Braun <rbraun@sceen.net> | 2018-02-20 20:46:37 +0100 |
---|---|---|
committer | Richard Braun <rbraun@sceen.net> | 2018-02-20 20:46:37 +0100 |
commit | da8eb9c244d27fd042adde6234ccec079681d7f4 (patch) | |
tree | f0de4b67e3a59dded6598efe3db87c835da72b92 /kern/rcu.c | |
parent | ed9ada07d9c841b3f3f3a3f841aae2792a30779a (diff) | |
parent | 22dff6b7a6839e77d713d671cb2038e56b64ac16 (diff) |
Merge branch 'preemptible_rcu'
Diffstat (limited to 'kern/rcu.c')
-rw-r--r-- | kern/rcu.c | 772 |
1 files changed, 772 insertions, 0 deletions
diff --git a/kern/rcu.c b/kern/rcu.c new file mode 100644 index 0000000..82c62df --- /dev/null +++ b/kern/rcu.c @@ -0,0 +1,772 @@ +/* + * Copyright (c) 2018 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This implementation is based on the paper "Extending RCU for Realtime + * and Embedded Workloads" by Paul E. McKenney, Ingo Molnar, Dipankar Sarma, + * and Suparna Bhattacharya. Beside the mechanisms not implemented yet, + * such as priority boosting, the differences are described below. + * + * First, this implementation uses scalable reference counters provided + * by the sref module instead of per-CPU counters as described in the paper. + * The main benefit of this approach is the centralization of most scalability + * improvements in the sref module, which should propagate to all sref users, + * including RCU. + * + * In addition, this implementation introduces the concept of windows, where + * a window is a range in time to which readers may be linked. Here, a + * grace period is defined as the time range at the end of a window where + * various synchronization steps are performed to enforce the RCU guarantees. + * The minimum duration of a window acts as a knob allowing users to tune + * the behavior of the RCU system. 
 *
 * Finally, the state machine described in the paper is updated to
 * accommodate windows, since grace periods don't run back-to-back to
 * each other. Windows are regularly checked and flipped if the previous
 * one isn't active any more. From that moment, processors may notice the
 * global flip and perform a local flip of their work window ID. Once all
 * processors have acknowledged the flip, it is certain that no new work
 * may be queued on the previous window. At this point, the same occurs
 * for the processor-local reader window ID, and once all processors have
 * acknowledged that flip, there can be no new reader linked to the
 * previous window. The RCU system then releases its own reference to the
 * previous window and waits for the window reference counter to drop to 0,
 * indicating that all readers linked to the previous window have left their
 * read-side critical section. When this global event occurs, processors are
 * requested to flush the works queued for the previous window, and once
 * they have all acknowledged their flush, the window ends and becomes
 * inactive, allowing a new grace period to occur later on.
 *
 * Here is an informal diagram describing this process:
 *
 *    t ---->
 *
 *    reader window flip ---+           +--- no more readers
 *    work window flip ----+|           |  +- works flushed
 *    (grace period start) ||           |  |  (grace period / window end)
 *                         vv           v  v
 *    +--------------+-+-----+-+
 *    |              . .     . |
 *    |   window 0   . . gp  . |
 *    |   removal    . .     . |  reclamation
 *    +--------------+-+-----+-+-----+----+
 *    |                      .       |
 *    |        window 1      .  gp   |
 *    |        removal       .       |  reclamation
 *    +---------------+----+--------
 *    |
 *    |   window 2 ...
 *    |
 *    +-------------
 *
 * TODO Improve atomic acknowledgment scalability.
 * TODO Handle large amounts of deferred works.
 * TODO Priority boosting of slow readers.
 * TODO CPU registration for dyntick-friendly behavior.
+ */ + +#include <assert.h> +#include <stdalign.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> + +#include <kern/atomic.h> +#include <kern/clock.h> +#include <kern/init.h> +#include <kern/macros.h> +#include <kern/rcu.h> +#include <kern/panic.h> +#include <kern/percpu.h> +#include <kern/spinlock.h> +#include <kern/sref.h> +#include <kern/syscnt.h> +#include <kern/thread.h> +#include <kern/timer.h> +#include <kern/work.h> +#include <machine/cpu.h> + +/* + * Negative close to 0 so that an overflow occurs early. + */ +#define RCU_WINDOW_ID_INIT_VALUE ((unsigned int)-500) + +/* + * Interval between window checking. + * + * When windows are checked, a flip occurs if the previous window isn't + * active any more. + */ +#define RCU_WINDOW_CHECK_INTERVAL_MS 10 + +/* + * Grace period states. + * + * These states are only used to trigger per-CPU processing that is + * globally acknowleged by decrementing a global atomic counter. They + * do not completely represent the actual state of a grace period. + */ +enum rcu_gp_state { + RCU_GP_STATE_WORK_WINDOW_FLIP, + RCU_GP_STATE_READER_WINDOW_FLIP, + RCU_GP_STATE_WORK_FLUSH, +}; + +/* + * Per-CPU view of a window. + * + * Deferred works are scheduled when the window ends. + */ +struct rcu_cpu_window { + struct work_queue works; +}; + +/* + * Per-CPU RCU data. + * + * Each processor maintains two local window IDs. One is used as the current + * window ID when deferring work, the other when detecting a reader. A local + * flip occurs when a processor notices that the global grace period state + * no longer matches the local grace period state. These checks only occur + * on periodic events. + * + * Interrupts and preemption must be disabled when accessing local CPU data. + */ +struct rcu_cpu_data { + enum rcu_gp_state gp_state; + unsigned int work_wid; + unsigned int reader_wid; + struct rcu_cpu_window windows[2]; + struct syscnt sc_nr_detected_readers; +}; + +/* + * Global window. 
+ * + * A window is a time range that tracks read-side references. Conceptually, + * each reader adds a reference to the current window. In practice, references + * are only added when readers are detected, which occurs on a context switch + * (to track preempted threads) or a reader window flip (to prevent currently + * running readers to be linked to the next window). + * + * When a window is started, its scalable reference counter is initialized + * with a reference owned by the RCU system. That reference guarantees that + * the window remains active as long as new readers may add references, + * since it prevents the counter from dropping to 0. After a reader window + * flip, there may not be new references to the window, and the initial + * reference is dropped, allowing the counter to reach 0 once all detected + * readers leave their critical section and unreference the window they're + * linked to. + */ +struct rcu_window { + struct sref_counter nr_refs; + uint64_t start_ts; + bool active; +}; + +/* + * Global data. + * + * Processors regularly check the grace period state against their own, + * locally cached grace period state, and take action whenever they differ. + * False sharing is avoided by making the global grace period state fill an + * entire cache line on SMP. + * + * After processors notice a grace period state change, they acknowledge + * noticing this change by decrementing the atomic acknowledgment counter, + * which also fills a complete cache line on SMP in order to restrict cache + * line bouncing. Atomic operations on this counter are done with + * acquire-release ordering to enforce the memory ordering guarantees + * required by the implementation, as well as those provided by the public + * interface. + * + * In addition to the global window ID and the windows themselves, the data + * include a timer, used to trigger the end of windows, i.e. grace periods. 
+ * Since the timer function, atomic acknowledgments, and window no-reference + * function chain each other, there is currently no need for a global lock. + */ +struct rcu_data { + struct { + alignas(CPU_L1_SIZE) enum rcu_gp_state gp_state; + }; + struct { + alignas(CPU_L1_SIZE) unsigned int nr_acks; + }; + + unsigned int wid; + struct rcu_window windows[2]; + struct timer timer; + struct syscnt sc_nr_windows; + struct syscnt sc_last_window_ms; + struct syscnt sc_longest_window_ms; +}; + +/* + * Structure used to implement rcu_wait(). + */ +struct rcu_waiter { + struct work work; + struct spinlock lock; + struct thread *thread; + bool done; +}; + +static struct rcu_data rcu_data; +static struct rcu_cpu_data rcu_cpu_data __percpu; + +static struct rcu_cpu_data * +rcu_get_cpu_data(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + return cpu_local_ptr(rcu_cpu_data); +} + +static enum rcu_gp_state +rcu_data_get_gp_state(const struct rcu_data *data) +{ + return data->gp_state; +} + +static unsigned int +rcu_data_get_wid(const struct rcu_data *data) +{ + return data->wid; +} + +static struct rcu_window * +rcu_data_get_window_from_index(struct rcu_data *data, size_t index) +{ + assert(index < ARRAY_SIZE(data->windows)); + return &data->windows[index]; +} + +static struct rcu_window * +rcu_data_get_window(struct rcu_data *data, unsigned int wid) +{ + return rcu_data_get_window_from_index(data, wid & 1); +} + +static void +rcu_data_update_gp_state(struct rcu_data *data, enum rcu_gp_state gp_state) +{ + assert(data->nr_acks == 0); + + switch (gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_FLUSH); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + assert(data->gp_state == RCU_GP_STATE_WORK_WINDOW_FLIP); + break; + case RCU_GP_STATE_WORK_FLUSH: + assert(data->gp_state == RCU_GP_STATE_READER_WINDOW_FLIP); + break; + default: + panic("rcu: invalid grace period state"); + } + + data->nr_acks = 
cpu_count(); + atomic_store(&data->gp_state, gp_state, ATOMIC_RELEASE); +} + +static bool +rcu_data_check_gp_state(const struct rcu_data *data, + enum rcu_gp_state local_gp_state, + enum rcu_gp_state *global_gp_state) +{ + *global_gp_state = atomic_load(&data->gp_state, ATOMIC_RELAXED); + + if (unlikely(local_gp_state != *global_gp_state)) { + atomic_fence_acquire(); + return true; + } + + return false; +} + +static void +rcu_window_end(struct rcu_window *window) +{ + assert(window->active); + window->active = false; +} + +static void +rcu_window_ref(struct rcu_window *window) +{ + sref_counter_inc(&window->nr_refs); +} + +static void +rcu_window_unref(struct rcu_window *window) +{ + sref_counter_dec(&window->nr_refs); +} + +static uint64_t +rcu_window_get_start_ts(const struct rcu_window *window) +{ + return window->start_ts; +} + +static void +rcu_window_flush(struct sref_counter *counter) +{ + (void)counter; + + rcu_data_update_gp_state(&rcu_data, RCU_GP_STATE_WORK_FLUSH); +} + +static void __init +rcu_window_init(struct rcu_window *window) +{ + window->active = false; +} + +static void +rcu_window_start(struct rcu_window *window) +{ + assert(!window->active); + + sref_counter_init(&window->nr_refs, 1, NULL, rcu_window_flush); + window->start_ts = clock_get_time(); + window->active = true; +} + +static bool +rcu_window_active(const struct rcu_window *window) +{ + return window->active; +} + +static void +rcu_data_end_prev_window(struct rcu_data *data, uint64_t now) +{ + struct rcu_window *window; + uint64_t duration; + + window = rcu_data_get_window(data, data->wid - 1); + duration = clock_ticks_to_ms(now - rcu_window_get_start_ts(window)); + syscnt_set(&data->sc_last_window_ms, duration); + + if (duration > syscnt_read(&data->sc_longest_window_ms)) { + syscnt_set(&data->sc_longest_window_ms, duration); + } + + rcu_window_end(window); +} + +static void +rcu_data_schedule_timer(struct rcu_data *data, uint64_t now) +{ + uint64_t ticks; + + ticks = 
clock_ticks_from_ms(RCU_WINDOW_CHECK_INTERVAL_MS); + timer_schedule(&data->timer, now + ticks); +} + +static void +rcu_data_ack_cpu(struct rcu_data *data) +{ + struct rcu_window *window; + unsigned int prev_nr_acks; + uint64_t now; + + prev_nr_acks = atomic_fetch_sub(&data->nr_acks, 1, ATOMIC_ACQ_REL); + + if (prev_nr_acks != 1) { + assert(prev_nr_acks != 0); + return; + } + + switch (data->gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_data_update_gp_state(data, RCU_GP_STATE_READER_WINDOW_FLIP); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + window = rcu_data_get_window(data, data->wid - 1); + rcu_window_unref(window); + break; + case RCU_GP_STATE_WORK_FLUSH: + now = clock_get_time(); + rcu_data_end_prev_window(data, now); + rcu_data_schedule_timer(data, now); + break; + default: + panic("rcu: invalid grace period state"); + } +} + +static bool +rcu_data_flip_windows(struct rcu_data *data) +{ + struct rcu_window *window; + + window = rcu_data_get_window(data, data->wid - 1); + + if (rcu_window_active(window)) { + return false; + } + + rcu_window_start(window); + syscnt_inc(&data->sc_nr_windows); + data->wid++; + rcu_data_update_gp_state(data, RCU_GP_STATE_WORK_WINDOW_FLIP); + return true; +} + +static void +rcu_data_check_windows(struct timer *timer) +{ + struct rcu_data *data; + bool flipped; + + data = &rcu_data; + flipped = rcu_data_flip_windows(data); + + if (!flipped) { + rcu_data_schedule_timer(data, timer_get_time(timer)); + } +} + +static void __init +rcu_data_init(struct rcu_data *data) +{ + data->gp_state = RCU_GP_STATE_WORK_FLUSH; + data->nr_acks = 0; + data->wid = RCU_WINDOW_ID_INIT_VALUE; + + for (size_t i = 0; i < ARRAY_SIZE(data->windows); i++) { + rcu_window_init(rcu_data_get_window_from_index(data, i)); + } + + rcu_window_start(rcu_data_get_window(data, data->wid)); + + timer_init(&data->timer, rcu_data_check_windows, 0); + rcu_data_schedule_timer(data, clock_get_time()); + + syscnt_register(&data->sc_nr_windows, "rcu_nr_windows"); + 
syscnt_register(&data->sc_last_window_ms, "rcu_last_window_ms"); + syscnt_register(&data->sc_longest_window_ms, "rcu_longest_window_ms"); +} + +static void __init +rcu_cpu_window_init(struct rcu_cpu_window *cpu_window) +{ + work_queue_init(&cpu_window->works); +} + +static void +rcu_cpu_window_queue(struct rcu_cpu_window *cpu_window, struct work *work) +{ + work_queue_push(&cpu_window->works, work); +} + +static void +rcu_cpu_window_flush(struct rcu_cpu_window *cpu_window) +{ + work_queue_schedule(&cpu_window->works, 0); + work_queue_init(&cpu_window->works); +} + +static unsigned int +rcu_cpu_data_get_reader_wid(const struct rcu_cpu_data *cpu_data) +{ + return cpu_data->reader_wid; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window_from_index(struct rcu_cpu_data *cpu_data, size_t index) +{ + assert(index < ARRAY_SIZE(cpu_data->windows)); + return &cpu_data->windows[index]; +} + +static struct rcu_cpu_window * +rcu_cpu_data_get_window(struct rcu_cpu_data *cpu_data, unsigned int wid) +{ + return rcu_cpu_data_get_window_from_index(cpu_data, wid & 1); +} + +static void __init +rcu_cpu_data_init(struct rcu_cpu_data *cpu_data, unsigned int cpu) +{ + struct rcu_data *data; + char name[SYSCNT_NAME_SIZE]; + + data = &rcu_data; + + cpu_data->gp_state = rcu_data_get_gp_state(data); + cpu_data->work_wid = rcu_data_get_wid(data); + cpu_data->reader_wid = cpu_data->work_wid; + + for (size_t i = 0; i < ARRAY_SIZE(cpu_data->windows); i++) { + rcu_cpu_window_init(rcu_cpu_data_get_window_from_index(cpu_data, i)); + } + + snprintf(name, sizeof(name), "rcu_nr_detected_readers/%u", cpu); + syscnt_register(&cpu_data->sc_nr_detected_readers, name); +} + +static void +rcu_cpu_data_queue(struct rcu_cpu_data *cpu_data, struct work *work) +{ + struct rcu_cpu_window *cpu_window; + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid); + rcu_cpu_window_queue(cpu_window, work); +} + +static void +rcu_cpu_data_flush(struct rcu_cpu_data *cpu_data) +{ + struct 
rcu_cpu_window *cpu_window; + + assert(cpu_data->work_wid == cpu_data->reader_wid); + + cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid - 1); + rcu_cpu_window_flush(cpu_window); +} + +void +rcu_reader_init(struct rcu_reader *reader) +{ + reader->level = 0; + reader->linked = false; +} + +static void +rcu_reader_link(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(reader == thread_rcu_reader(thread_self())); + assert(!rcu_reader_linked(reader)); + + reader->wid = rcu_cpu_data_get_reader_wid(cpu_data); + reader->linked = true; +} + +static void +rcu_reader_unlink(struct rcu_reader *reader) +{ + assert(reader->level == 0); + reader->linked = false; +} + +static void +rcu_reader_enter(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + struct rcu_window *window; + struct rcu_data *data; + unsigned int wid; + + if (rcu_reader_linked(reader)) { + return; + } + + data = &rcu_data; + wid = rcu_cpu_data_get_reader_wid(cpu_data); + window = rcu_data_get_window(data, wid); + + rcu_reader_link(reader, cpu_data); + rcu_window_ref(window); + + syscnt_inc(&cpu_data->sc_nr_detected_readers); +} + +void +rcu_reader_leave(struct rcu_reader *reader) +{ + struct rcu_window *window; + struct rcu_data *data; + + data = &rcu_data; + + window = rcu_data_get_window(data, reader->wid); + rcu_window_unref(window); + rcu_reader_unlink(reader); +} + +static void +rcu_reader_account(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data) +{ + if (rcu_reader_in_cs(reader)) { + rcu_reader_enter(reader, cpu_data); + } +} + +static void +rcu_cpu_data_flip_work_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + cpu_data->work_wid++; +} + +static void +rcu_cpu_data_flip_reader_wid(struct rcu_cpu_data *cpu_data) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_reader_account(thread_rcu_reader(thread_self()), cpu_data); + 
cpu_data->reader_wid++; +} + +static void +rcu_cpu_data_check_gp_state(struct rcu_cpu_data *cpu_data) +{ + enum rcu_gp_state local_gp_state, global_gp_state; + struct rcu_data *data; + bool diff; + + data = &rcu_data; + + for (;;) { + local_gp_state = cpu_data->gp_state; + diff = rcu_data_check_gp_state(data, local_gp_state, &global_gp_state); + + if (!diff) { + break; + } + + switch (global_gp_state) { + case RCU_GP_STATE_WORK_WINDOW_FLIP: + rcu_cpu_data_flip_work_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case RCU_GP_STATE_READER_WINDOW_FLIP: + rcu_cpu_data_flip_reader_wid(cpu_data); + rcu_data_ack_cpu(data); + break; + case RCU_GP_STATE_WORK_FLUSH: + rcu_cpu_data_flush(cpu_data); + rcu_data_ack_cpu(data); + break; + default: + panic("rcu: invalid grace period state"); + } + + cpu_data->gp_state = global_gp_state; + } +} + +void +rcu_report_context_switch(struct rcu_reader *reader) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + /* + * Most readers don't need to be accounted for because their execution + * doesn't overlap with a grace period. If a reader is preempted however, + * it must be accounted in case a grace period starts while the reader + * is preempted. Accounting also occurs when a grace period starts, and + * more exactly, when the reader window ID of a processor is flipped. 
+ */ + rcu_reader_account(reader, rcu_get_cpu_data()); +} + +void +rcu_report_periodic_event(void) +{ + assert(!cpu_intr_enabled()); + assert(!thread_preempt_enabled()); + + rcu_cpu_data_check_gp_state(rcu_get_cpu_data()); +} + +void +rcu_defer(struct work *work) +{ + struct rcu_cpu_data *cpu_data; + unsigned long flags; + + thread_preempt_disable_intr_save(&flags); + cpu_data = rcu_get_cpu_data(); + rcu_cpu_data_queue(cpu_data, work); + thread_preempt_enable_intr_restore(flags); +} + +static void +rcu_waiter_wakeup(struct work *work) +{ + struct rcu_waiter *waiter; + + waiter = structof(work, struct rcu_waiter, work); + + spinlock_lock(&waiter->lock); + waiter->done = true; + thread_wakeup(waiter->thread); + spinlock_unlock(&waiter->lock); +} + +static void +rcu_waiter_init(struct rcu_waiter *waiter, struct thread *thread) +{ + work_init(&waiter->work, rcu_waiter_wakeup); + spinlock_init(&waiter->lock); + waiter->thread = thread; + waiter->done = false; +} + +static void +rcu_waiter_wait(struct rcu_waiter *waiter) +{ + rcu_defer(&waiter->work); + + spinlock_lock(&waiter->lock); + + while (!waiter->done) { + thread_sleep(&waiter->lock, waiter, "rcu_wait"); + } + + spinlock_unlock(&waiter->lock); +} + +void +rcu_wait(void) +{ + struct rcu_waiter waiter; + + rcu_waiter_init(&waiter, thread_self()), + rcu_waiter_wait(&waiter); +} + +static int __init +rcu_setup(void) +{ + rcu_data_init(&rcu_data); + + for (size_t i = 0; i < cpu_count(); i++) { + rcu_cpu_data_init(percpu_ptr(rcu_cpu_data, i), i); + } + + return 0; +} + +INIT_OP_DEFINE(rcu_setup, + INIT_OP_DEP(clock_setup, true), + INIT_OP_DEP(cpu_mp_probe, true), + INIT_OP_DEP(spinlock_setup, true), + INIT_OP_DEP(sref_setup, true), + INIT_OP_DEP(syscnt_setup, true), + INIT_OP_DEP(thread_bootstrap, true), + INIT_OP_DEP(timer_setup, true), + INIT_OP_DEP(work_setup, true)); |