/*
* Copyright (c) 2018 Richard Braun.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* This implementation is based on the paper "Extending RCU for Realtime
* and Embedded Workloads" by Paul E. McKenney, Ingo Molnar, Dipankar Sarma,
* and Suparna Bhattacharya. Beside the mechanisms not implemented yet,
* such as priority boosting, the differences are described below.
*
* First, this implementation uses scalable reference counters provided
* by the sref module instead of per-CPU counters as described in the paper.
* The main benefit of this approach is the centralization of most scalability
* improvements in the sref module, which should propagate to all sref users,
* including RCU.
*
* In addition, this implementation introduces the concept of windows, where
* a window is a range in time to which readers may be linked. Here, a
* grace period is defined as the time range at the end of a window where
* various synchronization steps are performed to enforce the RCU guarantees.
* The minimum duration of a window acts as a knob allowing users to tune
* the behavior of the RCU system.
*
* Finally, the state machine described in the paper is updated to accommodate
* for windows, since grace periods don't run back-to-back to each other.
* Windows are regularly checked and flipped if the previous one isn't
* active any more. From that moment, processors may notice the global flip
* and perform a local flip of their work window ID. Once all processors
 * have acknowledged the flip, it is certain that no new work may be queued
* on the previous window. At this point, the same occurs for the
* processor-local reader window ID, and once all processors have
 * acknowledged that flip, there can be no new reader linked to the previous
* window. The RCU system then releases its own reference to the previous
* window and waits for the window reference counter to drop to 0, indicating
* that all readers linked to the previous window have left their read-side
* critical section. When this global event occurs, processors are requested
* to flush the works queued for the previous window, and once they all have
 * acknowledged their flush, the window ends and becomes inactive, allowing
* a new grace period to occur later on.
*
* Here is an informal diagram describing this process :
*
* t ---->
*
* reader window flip ---+ +--- no more readers
* work window flip ------+ | | +- works flushed
* (grace period start) | | | | (grace period / window end)
* v v v v
* +--------------+-+-----+-+
* | . . . |
* | window 0 . . gp . |
* | removal . . . | reclamation
* +--------------+-+-----+-+-----+----+
* | . |
* | window 1 . gp |
* | removal . | reclamation
* +---------------+----+--------
* |
* | window 2 ...
* |
* +-------------
*
* TODO Improve atomic acknowledgment scalability.
* TODO Handle large amounts of deferred works.
* TODO Priority boosting of slow readers.
* TODO CPU registration for dyntick-friendly behavior.
*/
/*
 * NOTE(review): the arguments of these #include directives were lost during
 * extraction. The list below is reconstructed from the identifiers used in
 * this file (assert, alignas, snprintf, atomic_*, clock_*, syscnt_*, sref_*,
 * thread_*, timer_*, work_*, percpu/cpu_local_ptr, INIT_OP_*, spinlock_*) —
 * verify against the upstream source before relying on it.
 */
#include <assert.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <kern/atomic.h>
#include <kern/clock.h>
#include <kern/init.h>
#include <kern/macros.h>
#include <kern/panic.h>
#include <kern/percpu.h>
#include <kern/rcu.h>
#include <kern/spinlock.h>
#include <kern/sref.h>
#include <kern/syscnt.h>
#include <kern/thread.h>
#include <kern/timer.h>
#include <kern/work.h>

#include <machine/cpu.h>
/*
 * Initial window ID.
 *
 * Negative close to 0 so that an overflow occurs early, making window ID
 * wrap-around bugs show up during normal testing instead of after a very
 * long uptime.
 */
#define RCU_WINDOW_ID_INIT_VALUE ((unsigned int)-500)

/*
 * Interval between window checking, in milliseconds.
 *
 * When windows are checked, a flip occurs if the previous window isn't
 * active any more. This value therefore acts as the minimum window
 * duration knob mentioned in the file header.
 */
#define RCU_WINDOW_CHECK_INTERVAL_MS 10

/*
 * Grace period states.
 *
 * These states are only used to trigger per-CPU processing that is
 * globally acknowledged by decrementing a global atomic counter. They
 * do not completely represent the actual state of a grace period.
 */
enum rcu_gp_state {
    RCU_GP_STATE_WORK_WINDOW_FLIP,
    RCU_GP_STATE_READER_WINDOW_FLIP,
    RCU_GP_STATE_WORK_FLUSH,
};
/*
 * Per-CPU view of a window.
 *
 * Deferred works are scheduled when the window ends.
 */
struct rcu_cpu_window {
    struct work_queue works;    /* works deferred while this window was current */
};

/*
 * Per-CPU RCU data.
 *
 * Each processor maintains two local window IDs. One is used as the current
 * window ID when deferring work, the other when detecting a reader. A local
 * flip occurs when a processor notices that the global grace period state
 * no longer matches the local grace period state. These checks only occur
 * on periodic events.
 *
 * Interrupts and preemption must be disabled when accessing local CPU data.
 */
struct rcu_cpu_data {
    enum rcu_gp_state gp_state;             /* locally cached grace period state */
    unsigned int work_wid;                  /* window ID used when deferring work */
    unsigned int reader_wid;                /* window ID used when detecting a reader */
    struct rcu_cpu_window windows[2];       /* local views of the two global windows */
    struct syscnt sc_nr_detected_readers;   /* number of readers detected on this CPU */
};
/*
 * Global window.
 *
 * A window is a time range that tracks read-side references. Conceptually,
 * each reader adds a reference to the current window. In practice, references
 * are only added when readers are detected, which occurs on a context switch
 * (to track preempted threads) or a reader window flip (to prevent currently
 * running readers from being linked to the next window).
 *
 * When a window is started, its scalable reference counter is initialized
 * with a reference owned by the RCU system. That reference guarantees that
 * the window remains active as long as new readers may add references,
 * since it prevents the counter from dropping to 0. After a reader window
 * flip, there may not be new references to the window, and the initial
 * reference is dropped, allowing the counter to reach 0 once all detected
 * readers leave their critical section and unreference the window they're
 * linked to.
 */
struct rcu_window {
    struct sref_counter nr_refs;    /* read-side references, plus the system's own */
    uint64_t start_ts;              /* time at which the window was started */
    bool active;                    /* true between window start and window end */
};
/*
 * Global data.
 *
 * Processors regularly check the grace period state against their own,
 * locally cached grace period state, and take action whenever they differ.
 * False sharing is avoided by making the global grace period state fill an
 * entire cache line on SMP.
 *
 * After processors notice a grace period state change, they acknowledge
 * noticing this change by decrementing the atomic acknowledgment counter,
 * which also fills a complete cache line on SMP in order to restrict cache
 * line bouncing. Atomic operations on this counter are done with
 * acquire-release ordering to enforce the memory ordering guarantees
 * required by the implementation, as well as those provided by the public
 * interface.
 *
 * In addition to the global window ID and the windows themselves, the data
 * include a timer, used to trigger the end of windows, i.e. grace periods.
 * Since the timer function, atomic acknowledgments, and window no-reference
 * function chain each other, there is currently no need for a global lock.
 */
struct rcu_data {
    struct {
        /* cache-line-aligned to avoid false sharing with nr_acks below */
        alignas(CPU_L1_SIZE) enum rcu_gp_state gp_state;
    };
    struct {
        /* pending per-CPU acknowledgments of the latest gp_state change */
        alignas(CPU_L1_SIZE) unsigned int nr_acks;
    };
    unsigned int wid;                   /* current global window ID */
    struct rcu_window windows[2];       /* the two windows, selected by wid parity */
    struct timer timer;                 /* triggers periodic window checking */
    struct syscnt sc_nr_windows;
    struct syscnt sc_last_window_ms;
    struct syscnt sc_longest_window_ms;
};

/*
 * Structure used to implement rcu_wait().
 *
 * The waiter queues a deferred work that wakes up the waiting thread once
 * the current grace period has ended.
 */
struct rcu_waiter {
    struct work work;       /* deferred work performing the wake-up */
    struct spinlock lock;   /* protects the done flag and the sleep/wakeup */
    struct thread *thread;  /* thread to wake up */
    bool done;              /* true once the work has run */
};
/* Global RCU state machine data, see struct rcu_data. */
static struct rcu_data rcu_data;

/* Per-CPU RCU data, see struct rcu_cpu_data. */
static struct rcu_cpu_data rcu_cpu_data __percpu;
/*
 * Return the RCU data of the local processor.
 *
 * Interrupts and preemption must be disabled while calling this function
 * and while using the returned pointer.
 */
static struct rcu_cpu_data *
rcu_get_cpu_data(void)
{
    assert(!cpu_intr_enabled());
    assert(!thread_preempt_enabled());

    return cpu_local_ptr(rcu_cpu_data);
}
/* Return the current global grace period state. */
static enum rcu_gp_state
rcu_data_get_gp_state(const struct rcu_data *data)
{
    return data->gp_state;
}

/* Return the current global window ID. */
static unsigned int
rcu_data_get_wid(const struct rcu_data *data)
{
    return data->wid;
}
/* Return the global window stored at the given array index. */
static struct rcu_window *
rcu_data_get_window_from_index(struct rcu_data *data, size_t index)
{
    assert(index < ARRAY_SIZE(data->windows));
    return &data->windows[index];
}

/*
 * Return the global window matching the given window ID.
 *
 * Only the parity of the ID matters, since there are exactly two windows.
 */
static struct rcu_window *
rcu_data_get_window(struct rcu_data *data, unsigned int wid)
{
    return rcu_data_get_window_from_index(data, wid % 2);
}
/*
 * Advance the global grace period state machine to gp_state.
 *
 * The states form a fixed cycle (work window flip -> reader window flip ->
 * work flush -> work window flip ...), enforced by the assertions below.
 * The acknowledgment counter is reset to the number of processors before
 * the new state is published with release ordering, which pairs with the
 * acquire fence in rcu_data_check_gp_state() so that processors observing
 * the new state also observe all updates that preceded it.
 *
 * May only be called once all acknowledgments of the previous state change
 * have been received (nr_acks == 0); this chaining is what makes a global
 * lock unnecessary (see the struct rcu_data description).
 */
static void
rcu_data_update_gp_state(struct rcu_data *data, enum rcu_gp_state gp_state)
{
    assert(data->nr_acks == 0);

    switch (gp_state) {
    case RCU_GP_STATE_WORK_WINDOW_FLIP:
        assert(data->gp_state == RCU_GP_STATE_WORK_FLUSH);
        break;
    case RCU_GP_STATE_READER_WINDOW_FLIP:
        assert(data->gp_state == RCU_GP_STATE_WORK_WINDOW_FLIP);
        break;
    case RCU_GP_STATE_WORK_FLUSH:
        assert(data->gp_state == RCU_GP_STATE_READER_WINDOW_FLIP);
        break;
    default:
        panic("rcu: invalid grace period state");
    }

    data->nr_acks = cpu_count();
    atomic_store(&data->gp_state, gp_state, ATOMIC_RELEASE);
}
static bool
rcu_data_check_gp_state(const struct rcu_data *data,
enum rcu_gp_state local_gp_state,
enum rcu_gp_state *global_gp_state)
{
*global_gp_state = atomic_load(&data->gp_state, ATOMIC_RELAXED);
if (unlikely(local_gp_state != *global_gp_state)) {
atomic_fence_acquire();
return true;
}
return false;
}
/* Mark the given window as inactive. */
static void
rcu_window_end(struct rcu_window *window)
{
    assert(window->active);
    window->active = false;
}

/* Add a read-side reference to the given window. */
static void
rcu_window_ref(struct rcu_window *window)
{
    sref_counter_inc(&window->nr_refs);
}

/* Drop a read-side reference from the given window. */
static void
rcu_window_unref(struct rcu_window *window)
{
    sref_counter_dec(&window->nr_refs);
}

/* Return the time at which the given window was started. */
static uint64_t
rcu_window_get_start_ts(const struct rcu_window *window)
{
    return window->start_ts;
}
/*
 * No-reference callback of a window's scalable reference counter.
 *
 * Runs once the counter of the previous window has dropped to 0, i.e. once
 * all readers linked to that window have left their read-side critical
 * sections. Triggers the work flush step of the grace period.
 */
static void
rcu_window_flush(struct sref_counter *counter)
{
    (void)counter;

    rcu_data_update_gp_state(&rcu_data, RCU_GP_STATE_WORK_FLUSH);
}
/* Initialize a window as inactive. */
static void __init
rcu_window_init(struct rcu_window *window)
{
    window->active = false;
}

/*
 * Start the given window.
 *
 * The reference counter is initialized to 1: that initial reference is
 * owned by the RCU system and keeps the window active until the reader
 * window flip completes (see rcu_data_ack_cpu()).
 */
static void
rcu_window_start(struct rcu_window *window)
{
    assert(!window->active);

    sref_counter_init(&window->nr_refs, 1, NULL, rcu_window_flush);
    window->start_ts = clock_get_time();
    window->active = true;
}

/* Return true if the given window is active. */
static bool
rcu_window_active(const struct rcu_window *window)
{
    return window->active;
}
/*
 * End the previous window, updating the window duration counters.
 */
static void
rcu_data_end_prev_window(struct rcu_data *data, uint64_t now)
{
    struct rcu_window *prev_window;
    uint64_t window_ms;

    prev_window = rcu_data_get_window(data, data->wid - 1);
    window_ms = clock_ticks_to_ms(now - rcu_window_get_start_ts(prev_window));

    syscnt_set(&data->sc_last_window_ms, window_ms);

    if (syscnt_read(&data->sc_longest_window_ms) < window_ms) {
        syscnt_set(&data->sc_longest_window_ms, window_ms);
    }

    rcu_window_end(prev_window);
}
/* Rearm the window-checking timer one check interval after the given time. */
static void
rcu_data_schedule_timer(struct rcu_data *data, uint64_t now)
{
    uint64_t delay = clock_ticks_from_ms(RCU_WINDOW_CHECK_INTERVAL_MS);

    timer_schedule(&data->timer, now + delay);
}
/*
 * Acknowledge the current grace period state change on behalf of a processor.
 *
 * The last processor to acknowledge (the one fetching 1) chains the next
 * step of the grace period:
 *  - after the work window flip, start the reader window flip,
 *  - after the reader window flip, drop the RCU system's own reference to
 *    the previous window; once its counter reaches 0, rcu_window_flush()
 *    triggers the work flush state,
 *  - after the work flush, end the previous window and rearm the window
 *    checking timer, allowing a new grace period later on.
 *
 * Acquire-release ordering on the counter makes the per-CPU processing
 * performed before each acknowledgment visible to whichever processor
 * triggers the next step.
 */
static void
rcu_data_ack_cpu(struct rcu_data *data)
{
    struct rcu_window *window;
    unsigned int prev_nr_acks;
    uint64_t now;

    prev_nr_acks = atomic_fetch_sub(&data->nr_acks, 1, ATOMIC_ACQ_REL);

    if (prev_nr_acks != 1) {
        assert(prev_nr_acks != 0);
        return;
    }

    switch (data->gp_state) {
    case RCU_GP_STATE_WORK_WINDOW_FLIP:
        rcu_data_update_gp_state(data, RCU_GP_STATE_READER_WINDOW_FLIP);
        break;
    case RCU_GP_STATE_READER_WINDOW_FLIP:
        window = rcu_data_get_window(data, data->wid - 1);
        rcu_window_unref(window);
        break;
    case RCU_GP_STATE_WORK_FLUSH:
        now = clock_get_time();
        rcu_data_end_prev_window(data, now);
        rcu_data_schedule_timer(data, now);
        break;
    default:
        panic("rcu: invalid grace period state");
    }
}
/*
 * Flip the global windows if possible.
 *
 * Return true if the flip occurred, i.e. if a new grace period was started.
 * The flip is refused while the previous window is still active, which is
 * how the minimum window duration is enforced.
 */
static bool
rcu_data_flip_windows(struct rcu_data *data)
{
    struct rcu_window *prev_window;

    prev_window = rcu_data_get_window(data, data->wid - 1);

    if (rcu_window_active(prev_window)) {
        /* The previous grace period hasn't completely ended yet. */
        return false;
    }

    rcu_window_start(prev_window);
    syscnt_inc(&data->sc_nr_windows);
    data->wid++;
    rcu_data_update_gp_state(data, RCU_GP_STATE_WORK_WINDOW_FLIP);
    return true;
}
/*
 * Timer function checking whether the windows may be flipped.
 *
 * The timer is rearmed here only when no flip occurs; otherwise, it is
 * rearmed at the end of the grace period (see rcu_data_ack_cpu()).
 */
static void
rcu_data_check_windows(struct timer *timer)
{
    struct rcu_data *data = &rcu_data;

    if (!rcu_data_flip_windows(data)) {
        rcu_data_schedule_timer(data, timer_get_time(timer));
    }
}
/*
 * Initialize the global RCU data.
 *
 * The grace period state starts at WORK_FLUSH so the first window flip
 * legitimately transitions to WORK_WINDOW_FLIP (see the assertions in
 * rcu_data_update_gp_state()).
 */
static void __init
rcu_data_init(struct rcu_data *data)
{
    data->gp_state = RCU_GP_STATE_WORK_FLUSH;
    data->nr_acks = 0;
    data->wid = RCU_WINDOW_ID_INIT_VALUE;

    for (size_t j = 0; j < ARRAY_SIZE(data->windows); j++) {
        rcu_window_init(rcu_data_get_window_from_index(data, j));
    }

    rcu_window_start(rcu_data_get_window(data, data->wid));

    timer_init(&data->timer, rcu_data_check_windows, 0);
    rcu_data_schedule_timer(data, clock_get_time());

    syscnt_register(&data->sc_nr_windows, "rcu_nr_windows");
    syscnt_register(&data->sc_last_window_ms, "rcu_last_window_ms");
    syscnt_register(&data->sc_longest_window_ms, "rcu_longest_window_ms");
}
/* Initialize a per-CPU window. */
static void __init
rcu_cpu_window_init(struct rcu_cpu_window *cpu_window)
{
    work_queue_init(&cpu_window->works);
}

/* Queue a deferred work on the given per-CPU window. */
static void
rcu_cpu_window_queue(struct rcu_cpu_window *cpu_window, struct work *work)
{
    work_queue_push(&cpu_window->works, work);
}

/*
 * Hand all works queued on the given per-CPU window over to the work
 * module, then reset the queue so it may be reused when this window
 * comes back around.
 */
static void
rcu_cpu_window_flush(struct rcu_cpu_window *cpu_window)
{
    work_queue_schedule(&cpu_window->works, 0);
    work_queue_init(&cpu_window->works);
}
/* Return the local reader window ID. */
static unsigned int
rcu_cpu_data_get_reader_wid(const struct rcu_cpu_data *cpu_data)
{
    return cpu_data->reader_wid;
}

/* Return the per-CPU window stored at the given array index. */
static struct rcu_cpu_window *
rcu_cpu_data_get_window_from_index(struct rcu_cpu_data *cpu_data, size_t index)
{
    assert(index < ARRAY_SIZE(cpu_data->windows));
    return &cpu_data->windows[index];
}

/*
 * Return the per-CPU window matching the given window ID.
 *
 * As with the global windows, only the ID parity matters.
 */
static struct rcu_cpu_window *
rcu_cpu_data_get_window(struct rcu_cpu_data *cpu_data, unsigned int wid)
{
    return rcu_cpu_data_get_window_from_index(cpu_data, wid % 2);
}
/*
 * Initialize the RCU data of the given processor, synchronizing the local
 * window IDs and grace period state with the global ones.
 */
static void __init
rcu_cpu_data_init(struct rcu_cpu_data *cpu_data, unsigned int cpu)
{
    struct rcu_data *data = &rcu_data;
    char name[SYSCNT_NAME_SIZE];

    cpu_data->gp_state = rcu_data_get_gp_state(data);
    cpu_data->work_wid = rcu_data_get_wid(data);
    cpu_data->reader_wid = cpu_data->work_wid;

    for (size_t i = 0; i < ARRAY_SIZE(cpu_data->windows); i++) {
        rcu_cpu_window_init(rcu_cpu_data_get_window_from_index(cpu_data, i));
    }

    snprintf(name, sizeof(name), "rcu_nr_detected_readers/%u", cpu);
    syscnt_register(&cpu_data->sc_nr_detected_readers, name);
}
/* Queue a deferred work on the local view of the current work window. */
static void
rcu_cpu_data_queue(struct rcu_cpu_data *cpu_data, struct work *work)
{
    struct rcu_cpu_window *cpu_window;

    cpu_window = rcu_cpu_data_get_window(cpu_data, cpu_data->work_wid);
    rcu_cpu_window_queue(cpu_window, work);
}

/*
 * Flush the works queued on the local view of the previous window.
 *
 * Both local window IDs must have been flipped already, hence the assertion.
 */
static void
rcu_cpu_data_flush(struct rcu_cpu_data *cpu_data)
{
    assert(cpu_data->work_wid == cpu_data->reader_wid);

    rcu_cpu_window_flush(rcu_cpu_data_get_window(cpu_data,
                                                 cpu_data->work_wid - 1));
}
/* Initialize an RCU reader as outside any critical section and unlinked. */
void
rcu_reader_init(struct rcu_reader *reader)
{
    reader->level = 0;
    reader->linked = false;
}

/*
 * Link the given reader to the current local reader window.
 *
 * Must run with interrupts disabled, on the reader's own thread.
 */
static void
rcu_reader_link(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data)
{
    assert(!cpu_intr_enabled());
    assert(reader == thread_rcu_reader(thread_self()));
    assert(!rcu_reader_linked(reader));

    reader->wid = rcu_cpu_data_get_reader_wid(cpu_data);
    reader->linked = true;
}

/*
 * Unlink the given reader from its window.
 *
 * The reader must have completely left its critical section (level 0).
 */
static void
rcu_reader_unlink(struct rcu_reader *reader)
{
    assert(reader->level == 0);
    reader->linked = false;
}
/*
 * Detect the given reader: link it to the current local reader window and
 * add a reference to the matching global window on its behalf.
 *
 * Idempotent: a reader already linked is left untouched.
 */
static void
rcu_reader_enter(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data)
{
    struct rcu_data *data = &rcu_data;

    if (rcu_reader_linked(reader)) {
        return;
    }

    unsigned int wid = rcu_cpu_data_get_reader_wid(cpu_data);
    struct rcu_window *window = rcu_data_get_window(data, wid);

    rcu_reader_link(reader, cpu_data);
    rcu_window_ref(window);

    syscnt_inc(&cpu_data->sc_nr_detected_readers);
}
/*
 * Report that a detected reader left its critical section: release its
 * reference on the window it was linked to, then unlink it.
 */
void
rcu_reader_leave(struct rcu_reader *reader)
{
    struct rcu_data *data = &rcu_data;

    rcu_window_unref(rcu_data_get_window(data, reader->wid));
    rcu_reader_unlink(reader);
}

/*
 * Detect the given reader, but only if it's currently inside a read-side
 * critical section; other readers need no accounting.
 */
static void
rcu_reader_account(struct rcu_reader *reader, struct rcu_cpu_data *cpu_data)
{
    if (rcu_reader_in_cs(reader)) {
        rcu_reader_enter(reader, cpu_data);
    }
}
/*
 * Flip the local work window ID in response to a global work window flip.
 */
static void
rcu_cpu_data_flip_work_wid(struct rcu_cpu_data *cpu_data)
{
    assert(!cpu_intr_enabled());
    assert(!thread_preempt_enabled());

    cpu_data->work_wid++;
}

/*
 * Flip the local reader window ID in response to a global reader window flip.
 *
 * The current thread is accounted *before* the flip so that, if it's inside
 * a read-side critical section, it gets linked to the previous window and
 * can't end up linked to the next one (see the struct rcu_window comment).
 */
static void
rcu_cpu_data_flip_reader_wid(struct rcu_cpu_data *cpu_data)
{
    assert(!cpu_intr_enabled());
    assert(!thread_preempt_enabled());

    rcu_reader_account(thread_rcu_reader(thread_self()), cpu_data);
    cpu_data->reader_wid++;
}
/*
 * Compare the locally cached grace period state with the global one, and,
 * whenever they differ, perform the corresponding local processing and
 * acknowledge the state change.
 */
static void
rcu_cpu_data_check_gp_state(struct rcu_cpu_data *cpu_data)
{
    enum rcu_gp_state local_gp_state, global_gp_state;
    struct rcu_data *data;
    bool diff;

    data = &rcu_data;

    /*
     * A loop is used to optimize the case where a processor is the last to
     * acknowledge a grace period state change, in which case the latter
     * also immediately changes and can be acknowledged right away. As a
     * result, this loop may never run more than twice.
     */
    for (unsigned int i = 0; /* no condition */; i++) {
        local_gp_state = cpu_data->gp_state;
        diff = rcu_data_check_gp_state(data, local_gp_state, &global_gp_state);

        if (!diff) {
            break;
        }

        assert(i < 2);

        switch (global_gp_state) {
        case RCU_GP_STATE_WORK_WINDOW_FLIP:
            rcu_cpu_data_flip_work_wid(cpu_data);
            rcu_data_ack_cpu(data);
            break;
        case RCU_GP_STATE_READER_WINDOW_FLIP:
            rcu_cpu_data_flip_reader_wid(cpu_data);
            rcu_data_ack_cpu(data);
            break;
        case RCU_GP_STATE_WORK_FLUSH:
            rcu_cpu_data_flush(cpu_data);
            rcu_data_ack_cpu(data);
            break;
        default:
            panic("rcu: invalid grace period state");
        }

        cpu_data->gp_state = global_gp_state;
    }
}
/*
 * Report a context switch on the current processor on behalf of the given
 * (outgoing) reader.
 *
 * Interrupts and preemption must be disabled.
 */
void
rcu_report_context_switch(struct rcu_reader *reader)
{
    assert(!cpu_intr_enabled());
    assert(!thread_preempt_enabled());

    /*
     * Most readers don't need to be accounted for because their execution
     * doesn't overlap with a grace period. If a reader is preempted however,
     * it must be accounted in case a grace period starts while the reader
     * is preempted. Accounting also occurs when a grace period starts, and
     * more exactly, when the reader window ID of a processor is flipped.
     */
    rcu_reader_account(reader, rcu_get_cpu_data());
}
/*
 * Report a periodic event (e.g. the periodic timer interrupt) on the
 * current processor, driving the local grace period state machine.
 *
 * Interrupts and preemption must be disabled.
 */
void
rcu_report_periodic_event(void)
{
    assert(!cpu_intr_enabled());
    assert(!thread_preempt_enabled());

    rcu_cpu_data_check_gp_state(rcu_get_cpu_data());
}
/*
 * Defer a work until all existing read-side references are dropped, by
 * queuing it on the local view of the current work window.
 */
void
rcu_defer(struct work *work)
{
    unsigned long flags;

    thread_preempt_disable_intr_save(&flags);

    struct rcu_cpu_data *cpu_data = rcu_get_cpu_data();
    rcu_cpu_data_queue(cpu_data, work);

    thread_preempt_enable_intr_restore(flags);
}
/*
 * Deferred work function waking up a waiter once a grace period has ended.
 */
static void
rcu_waiter_wakeup(struct work *work)
{
    struct rcu_waiter *waiter = structof(work, struct rcu_waiter, work);

    spinlock_lock(&waiter->lock);
    waiter->done = true;
    thread_wakeup(waiter->thread);
    spinlock_unlock(&waiter->lock);
}

/* Initialize a waiter for the given thread. */
static void
rcu_waiter_init(struct rcu_waiter *waiter, struct thread *thread)
{
    work_init(&waiter->work, rcu_waiter_wakeup);
    spinlock_init(&waiter->lock);
    waiter->thread = thread;
    waiter->done = false;
}

/*
 * Defer the waiter's wake-up work and sleep until it has run, i.e. until
 * a grace period has ended.
 */
static void
rcu_waiter_wait(struct rcu_waiter *waiter)
{
    rcu_defer(&waiter->work);

    spinlock_lock(&waiter->lock);

    while (!waiter->done) {
        thread_sleep(&waiter->lock, waiter, "rcu_wait");
    }

    spinlock_unlock(&waiter->lock);
}
/*
 * Block the calling thread until a complete grace period has ended, i.e.
 * until all read-side critical sections existing on entry have completed.
 *
 * Fix: the call to rcu_waiter_init() was terminated with a comma (turning
 * the two calls into a single comma-operator expression) instead of a
 * semicolon. Behavior was incidentally identical, but the typo is corrected.
 */
void
rcu_wait(void)
{
    struct rcu_waiter waiter;

    rcu_waiter_init(&waiter, thread_self());
    rcu_waiter_wait(&waiter);
}
/*
 * Bootstrap the RCU system: initialize the global data and the local
 * data of the boot processor.
 */
static int __init
rcu_bootstrap(void)
{
    rcu_data_init(&rcu_data);
    rcu_cpu_data_init(cpu_local_ptr(rcu_cpu_data), 0);
    return 0;
}

INIT_OP_DEFINE(rcu_bootstrap,
               INIT_OP_DEP(spinlock_setup, true),
               INIT_OP_DEP(sref_bootstrap, true),
               INIT_OP_DEP(syscnt_setup, true),
               INIT_OP_DEP(thread_bootstrap, true),
               INIT_OP_DEP(timer_bootstrap, true));
/*
 * Finish RCU initialization once all processors are known.
 *
 * The boot processor (CPU 0) was already initialized during bootstrap,
 * hence the loop starting at 1.
 */
static int __init
rcu_setup(void)
{
    for (unsigned int cpu = 1; cpu < cpu_count(); cpu++) {
        rcu_cpu_data_init(percpu_ptr(rcu_cpu_data, cpu), cpu);
    }

    return 0;
}

INIT_OP_DEFINE(rcu_setup,
               INIT_OP_DEP(cpu_mp_probe, true),
               INIT_OP_DEP(rcu_bootstrap, true));