-rw-r--r--  arch/x86/machine/cpu.c       |  11
-rw-r--r--  arch/x86/machine/cpu.h       |  14
-rw-r--r--  arch/x86/machine/trap.c      |   3
-rw-r--r--  arch/x86/machine/trap.h      |   1
-rw-r--r--  arch/x86/machine/trap_asm.S  |   1
-rw-r--r--  kern/llsync.c                | 242
-rw-r--r--  kern/llsync.h                | 124
-rw-r--r--  kern/llsync_i.h              |  89
-rw-r--r--  kern/thread.c                |  17
-rw-r--r--  kern/thread.h                |  35
10 files changed, 324 insertions(+), 213 deletions(-)
diff --git a/arch/x86/machine/cpu.c b/arch/x86/machine/cpu.c
index 8d680d8f..18686a38 100644
--- a/arch/x86/machine/cpu.c
+++ b/arch/x86/machine/cpu.c
@@ -17,7 +17,6 @@
#include <kern/assert.h>
#include <kern/init.h>
-#include <kern/llsync.h>
#include <kern/macros.h>
#include <kern/panic.h>
#include <kern/param.h>
@@ -608,13 +607,3 @@ cpu_thread_schedule_intr(struct trap_frame *frame)
thread_schedule_intr();
}
-
-void
-cpu_llsync_reset_intr(struct trap_frame *frame)
-{
- (void)frame;
-
- lapic_eoi();
-
- llsync_reset_checkpoint(cpu_id());
-}
diff --git a/arch/x86/machine/cpu.h b/arch/x86/machine/cpu.h
index f1e9cfb6..07dd98db 100644
--- a/arch/x86/machine/cpu.h
+++ b/arch/x86/machine/cpu.h
@@ -658,20 +658,6 @@ cpu_send_thread_schedule(unsigned int cpu)
*/
void cpu_thread_schedule_intr(struct trap_frame *frame);
-/*
- * Request a remote processor to reset its checkpoint.
- */
-static inline void
-cpu_send_llsync_reset(unsigned int cpu)
-{
- lapic_ipi_send(cpu_from_id(cpu)->apic_id, TRAP_LLSYNC_RESET);
-}
-
-/*
- * Interrupt handler for checkpoint reset requests.
- */
-void cpu_llsync_reset_intr(struct trap_frame *frame);
-
#endif /* __ASSEMBLER__ */
#endif /* _X86_CPU_H */
diff --git a/arch/x86/machine/trap.c b/arch/x86/machine/trap.c
index ad586ea9..8e350f00 100644
--- a/arch/x86/machine/trap.c
+++ b/arch/x86/machine/trap.c
@@ -75,7 +75,6 @@ void trap_isr_machine_check(void);
void trap_isr_simd_fp_exception(void);
void trap_isr_pic_int7(void);
void trap_isr_pic_int15(void);
-void trap_isr_llsync_reset(void);
void trap_isr_thread_schedule(void);
void trap_isr_cpu_halt(void);
void trap_isr_lapic_timer(void);
@@ -202,8 +201,6 @@ trap_setup(void)
trap_isr_pic_int15, pic_intr_spurious);
/* System defined traps */
- trap_install(TRAP_LLSYNC_RESET, TRAP_HF_NOPREEMPT,
- trap_isr_llsync_reset, cpu_llsync_reset_intr);
trap_install(TRAP_THREAD_SCHEDULE, TRAP_HF_NOPREEMPT,
trap_isr_thread_schedule, cpu_thread_schedule_intr);
trap_install(TRAP_CPU_HALT, TRAP_HF_NOPREEMPT,
diff --git a/arch/x86/machine/trap.h b/arch/x86/machine/trap.h
index bc0426d0..99e23896 100644
--- a/arch/x86/machine/trap.h
+++ b/arch/x86/machine/trap.h
@@ -53,7 +53,6 @@
*
* The local APIC assigns one priority every 16 vectors.
*/
-#define TRAP_LLSYNC_RESET 238
#define TRAP_THREAD_SCHEDULE 239
#define TRAP_CPU_HALT 240
#define TRAP_LAPIC_TIMER 253
diff --git a/arch/x86/machine/trap_asm.S b/arch/x86/machine/trap_asm.S
index 7d06d9dd..bb70f46e 100644
--- a/arch/x86/machine/trap_asm.S
+++ b/arch/x86/machine/trap_asm.S
@@ -157,7 +157,6 @@ TRAP(TRAP_PIC_BASE + 7, trap_isr_pic_int7)
TRAP(TRAP_PIC_BASE + 15, trap_isr_pic_int15)
/* System defined traps */
-TRAP(TRAP_LLSYNC_RESET, trap_isr_llsync_reset)
TRAP(TRAP_THREAD_SCHEDULE, trap_isr_thread_schedule)
TRAP(TRAP_CPU_HALT, trap_isr_cpu_halt)
TRAP(TRAP_LAPIC_TIMER, trap_isr_lapic_timer)
diff --git a/kern/llsync.c b/kern/llsync.c
index ba99a9b0..7c3e1c69 100644
--- a/kern/llsync.c
+++ b/kern/llsync.c
@@ -50,50 +50,20 @@
#include <kern/work.h>
#include <machine/cpu.h>
-#define LLSYNC_NR_PENDING_WORKS_WARN 10000
-
-struct llsync_cpu llsync_cpus[MAX_CPUS];
-
-/*
- * Global lock protecting the remaining module data.
- *
- * Interrupts must be disabled when acquiring this lock.
- */
-static struct spinlock llsync_lock;
-
-/*
- * Map of processors regularly checking in.
- */
-static struct cpumap llsync_registered_cpus;
-static unsigned int llsync_nr_registered_cpus;
-
/*
- * Map of processors for which a checkpoint commit is pending.
+ * Initial global checkpoint ID.
*
- * To reduce contention, checking in only affects a single per-processor
- * cache line. Special events (currently the system timer interrupt only)
- * trigger checkpoint commits, which report the local state to this CPU
- * map, thereby acquiring the global lock.
+ * Set to a high value to make sure overflows are correctly handled.
*/
-static struct cpumap llsync_pending_checkpoints;
-static unsigned int llsync_nr_pending_checkpoints;
+#define LLSYNC_INITIAL_GCID ((unsigned int)-10)
/*
- * Queues of deferred works.
- *
- * The queue number matches the number of global checkpoints that occurred
- * since works contained in it were added. After two global checkpoints,
- * works are scheduled for processing.
+ * Number of pending works beyond which to issue a warning.
*/
-static struct work_queue llsync_queue0;
-static struct work_queue llsync_queue1;
+#define LLSYNC_NR_PENDING_WORKS_WARN 10000
-/*
- * Number of works not yet scheduled for processing.
- *
- * Mostly unused, except for debugging.
- */
-static unsigned long llsync_nr_pending_works;
+struct llsync_data llsync_data;
+struct llsync_cpu_data llsync_cpu_data[MAX_CPUS];
struct llsync_waiter {
struct work work;
@@ -105,161 +75,165 @@ struct llsync_waiter {
void __init
llsync_setup(void)
{
- char name[EVCNT_NAME_SIZE];
- unsigned int cpu;
-
- spinlock_init(&llsync_lock);
- work_queue_init(&llsync_queue0);
- work_queue_init(&llsync_queue1);
-
- for (cpu = 0; cpu < cpu_count(); cpu++) {
- snprintf(name, sizeof(name), "llsync_reset/%u", cpu);
- evcnt_register(&llsync_cpus[cpu].ev_reset, name);
- snprintf(name, sizeof(name), "llsync_spurious_reset/%u", cpu);
- evcnt_register(&llsync_cpus[cpu].ev_spurious_reset, name);
- }
-}
-
-static void
-llsync_reset_checkpoint_common(unsigned int cpu)
-{
- assert(!cpumap_test(&llsync_pending_checkpoints, cpu));
- cpumap_set(&llsync_pending_checkpoints, cpu);
- llsync_cpus[cpu].checked = 0;
+ spinlock_init(&llsync_data.lock);
+ work_queue_init(&llsync_data.queue0);
+ work_queue_init(&llsync_data.queue1);
+ evcnt_register(&llsync_data.ev_global_checkpoint,
+ "llsync_global_checkpoint");
+ evcnt_register(&llsync_data.ev_periodic_checkin,
+ "llsync_periodic_checkin");
+ evcnt_register(&llsync_data.ev_failed_periodic_checkin,
+ "llsync_failed_periodic_checkin");
+ llsync_data.gcid.value = LLSYNC_INITIAL_GCID;
}
static void
-llsync_process_global_checkpoint(unsigned int cpu)
+llsync_process_global_checkpoint(void)
{
struct work_queue queue;
unsigned int nr_works;
- int i;
- if (llsync_nr_registered_cpus == 0) {
- work_queue_concat(&llsync_queue1, &llsync_queue0);
- work_queue_init(&llsync_queue0);
- }
+ assert(cpumap_find_first(&llsync_data.pending_checkpoints) == -1);
+ assert(llsync_data.nr_pending_checkpoints == 0);
- work_queue_transfer(&queue, &llsync_queue1);
- work_queue_transfer(&llsync_queue1, &llsync_queue0);
- work_queue_init(&llsync_queue0);
-
- llsync_nr_pending_checkpoints = llsync_nr_registered_cpus;
-
- if (llsync_cpus[cpu].registered)
- llsync_reset_checkpoint_common(cpu);
-
- cpumap_for_each(&llsync_registered_cpus, i)
- if ((unsigned int)i != cpu)
- cpu_send_llsync_reset(i);
+ if (llsync_data.nr_registered_cpus == 0) {
+ work_queue_concat(&llsync_data.queue1, &llsync_data.queue0);
+ work_queue_init(&llsync_data.queue0);
+ } else {
+ cpumap_copy(&llsync_data.pending_checkpoints, &llsync_data.registered_cpus);
+ llsync_data.nr_pending_checkpoints = llsync_data.nr_registered_cpus;
+ }
+ work_queue_transfer(&queue, &llsync_data.queue1);
+ work_queue_transfer(&llsync_data.queue1, &llsync_data.queue0);
+ work_queue_init(&llsync_data.queue0);
nr_works = work_queue_nr_works(&queue);
if (nr_works != 0) {
- llsync_nr_pending_works -= nr_works;
+ llsync_data.nr_pending_works -= nr_works;
work_queue_schedule(&queue, 0);
}
+
+ llsync_data.gcid.value++;
+ evcnt_inc(&llsync_data.ev_global_checkpoint);
}
static void
-llsync_commit_checkpoint_common(unsigned int cpu)
+llsync_commit_checkpoint(unsigned int cpu)
{
int pending;
- pending = cpumap_test(&llsync_pending_checkpoints, cpu);
+ pending = cpumap_test(&llsync_data.pending_checkpoints, cpu);
if (!pending)
return;
- cpumap_clear(&llsync_pending_checkpoints, cpu);
- llsync_nr_pending_checkpoints--;
+ cpumap_clear(&llsync_data.pending_checkpoints, cpu);
+ llsync_data.nr_pending_checkpoints--;
- if (llsync_nr_pending_checkpoints == 0)
- llsync_process_global_checkpoint(cpu);
+ if (llsync_data.nr_pending_checkpoints == 0)
+ llsync_process_global_checkpoint();
}
void
-llsync_register_cpu(unsigned int cpu)
+llsync_register(void)
{
+ struct llsync_cpu_data *cpu_data;
unsigned long flags;
+ unsigned int cpu;
+
+ cpu = cpu_id();
+ cpu_data = llsync_get_cpu_data(cpu);
- spinlock_lock_intr_save(&llsync_lock, &flags);
+ spinlock_lock_intr_save(&llsync_data.lock, &flags);
- assert(!llsync_cpus[cpu].registered);
- llsync_cpus[cpu].registered = 1;
+ assert(!cpu_data->registered);
+ cpu_data->registered = 1;
+ cpu_data->gcid = llsync_data.gcid.value;
- assert(!cpumap_test(&llsync_registered_cpus, cpu));
- cpumap_set(&llsync_registered_cpus, cpu);
- llsync_nr_registered_cpus++;
+ assert(!cpumap_test(&llsync_data.registered_cpus, cpu));
+ cpumap_set(&llsync_data.registered_cpus, cpu);
+ llsync_data.nr_registered_cpus++;
- assert(!cpumap_test(&llsync_pending_checkpoints, cpu));
+ assert(!cpumap_test(&llsync_data.pending_checkpoints, cpu));
- if ((llsync_nr_registered_cpus == 1)
- && (llsync_nr_pending_checkpoints == 0))
- llsync_process_global_checkpoint(cpu);
+ if ((llsync_data.nr_registered_cpus == 1)
+ && (llsync_data.nr_pending_checkpoints == 0))
+ llsync_process_global_checkpoint();
- spinlock_unlock_intr_restore(&llsync_lock, flags);
+ spinlock_unlock_intr_restore(&llsync_data.lock, flags);
}
void
-llsync_unregister_cpu(unsigned int cpu)
+llsync_unregister(void)
{
+ struct llsync_cpu_data *cpu_data;
unsigned long flags;
+ unsigned int cpu;
- spinlock_lock_intr_save(&llsync_lock, &flags);
+ cpu = cpu_id();
+ cpu_data = llsync_get_cpu_data(cpu);
- assert(llsync_cpus[cpu].registered);
- llsync_cpus[cpu].registered = 0;
+ spinlock_lock_intr_save(&llsync_data.lock, &flags);
- assert(cpumap_test(&llsync_registered_cpus, cpu));
- cpumap_clear(&llsync_registered_cpus, cpu);
- llsync_nr_registered_cpus--;
+ assert(cpu_data->registered);
+ cpu_data->registered = 0;
+
+ assert(cpumap_test(&llsync_data.registered_cpus, cpu));
+ cpumap_clear(&llsync_data.registered_cpus, cpu);
+ llsync_data.nr_registered_cpus--;
/*
* Processor registration qualifies as a checkpoint. Since unregistering
* a processor also disables commits until it's registered again, perform
* one now.
*/
- llsync_commit_checkpoint_common(cpu);
+ llsync_commit_checkpoint(cpu);
- spinlock_unlock_intr_restore(&llsync_lock, flags);
+ spinlock_unlock_intr_restore(&llsync_data.lock, flags);
}
void
-llsync_reset_checkpoint(unsigned int cpu)
+llsync_report_periodic_event(void)
{
+ struct llsync_cpu_data *cpu_data;
+ unsigned int cpu, gcid;
+
assert(!cpu_intr_enabled());
+ assert(!thread_preempt_enabled());
- spinlock_lock(&llsync_lock);
+ cpu = cpu_id();
+ cpu_data = llsync_get_cpu_data(cpu);
- evcnt_inc(&llsync_cpus[cpu].ev_reset);
- llsync_reset_checkpoint_common(cpu);
+ if (!cpu_data->registered)
+ return;
+
+ spinlock_lock(&llsync_data.lock);
+
+ gcid = llsync_data.gcid.value;
+ assert((gcid - cpu_data->gcid) <= 1);
/*
- * It may happen that this processor was registered at the time a global
- * checkpoint occurred, but unregistered itself before receiving the reset
- * interrupt. In this case, behave as if the reset request was received
- * before unregistering by immediately committing the local checkpoint.
+ * If the local copy of the global checkpoint ID matches the true
+ * value, the current processor has checked in.
+ *
+ * Otherwise, there has been no checkpoint since the last global checkpoint.
+ * Check whether this periodic event occurred during a read-side critical
+ * section, and if not, trigger a checkpoint.
*/
- if (!llsync_cpus[cpu].registered) {
- evcnt_inc(&llsync_cpus[cpu].ev_spurious_reset);
- llsync_commit_checkpoint_common(cpu);
+ if (cpu_data->gcid == gcid)
+ llsync_commit_checkpoint(cpu);
+ else {
+ if (thread_llsync_in_read_cs())
+ evcnt_inc(&llsync_data.ev_failed_periodic_checkin);
+ else {
+ cpu_data->gcid = gcid;
+ evcnt_inc(&llsync_data.ev_periodic_checkin);
+ llsync_commit_checkpoint(cpu);
+ }
}
- spinlock_unlock(&llsync_lock);
-}
-
-void
-llsync_commit_checkpoint(unsigned int cpu)
-{
- assert(!cpu_intr_enabled());
-
- if (!(llsync_cpus[cpu].registered && llsync_cpus[cpu].checked))
- return;
-
- spinlock_lock(&llsync_lock);
- llsync_commit_checkpoint_common(cpu);
- spinlock_unlock(&llsync_lock);
+ spinlock_unlock(&llsync_data.lock);
}
void
@@ -267,15 +241,15 @@ llsync_defer(struct work *work)
{
unsigned long flags;
- spinlock_lock_intr_save(&llsync_lock, &flags);
+ spinlock_lock_intr_save(&llsync_data.lock, &flags);
- work_queue_push(&llsync_queue0, work);
- llsync_nr_pending_works++;
+ work_queue_push(&llsync_data.queue0, work);
+ llsync_data.nr_pending_works++;
- if (llsync_nr_pending_works == LLSYNC_NR_PENDING_WORKS_WARN)
+ if (llsync_data.nr_pending_works == LLSYNC_NR_PENDING_WORKS_WARN)
printk("llsync: warning: large number of pending works\n");
- spinlock_unlock_intr_restore(&llsync_lock, flags);
+ spinlock_unlock_intr_restore(&llsync_data.lock, flags);
}
static void
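
The two work queues rotated in llsync_process_global_checkpoint() above are what give deferred works their grace period of two global checkpoints. Below is a minimal standalone model of that rotation, with hypothetical names and plain counters standing in for the kernel's work_queue structures:

    #include <stdio.h>

    int
    main(void)
    {
        int queue0 = 0, queue1 = 0, scheduled = 0;

        queue0 = 1;    /* llsync_defer(): one work pushed into queue0 */

        for (int gc = 1; gc <= 2; gc++) {
            /* Global checkpoint: queue1 is handed to the work module,
             * queue0 ages into queue1, queue0 is reset. */
            scheduled += queue1;
            queue1 = queue0;
            queue0 = 0;
            printf("after global checkpoint %d: %d work(s) scheduled\n",
                   gc, scheduled);
        }

        return 0;
    }

A work only reaches the work module after the second global checkpoint following its insertion, by which point any read-side critical section that could still hold a reference to the old object has necessarily ended.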
diff --git a/kern/llsync.h b/kern/llsync.h
index 1919b62a..0d7438bb 100644
--- a/kern/llsync.h
+++ b/kern/llsync.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013 Richard Braun.
+ * Copyright (c) 2013-2014 Richard Braun.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -16,6 +16,55 @@
*
*
* Lockless synchronization.
+ *
+ * The llsync module provides services similar to RCU (Read-Copy Update).
+ * As such, it can be thought of as an efficient reader-writer lock
+ * replacement. It is efficient because read-side critical sections
+ * don't use expensive synchronization mechanisms such as locks or atomic
+ * instructions. Lockless synchronization is therefore best used for
+ * read-mostly objects. Updating still requires conventional lock-based
+ * synchronization.
+ *
+ * The basic idea is that read-side critical sections are assumed to hold
+ * read-side references, and objects for which there may be read-side
+ * references must exist as long as such references may be held. The llsync
+ * module tracks special system events to determine when read-side references
+ * can no longer exist.
+ *
+ * Since read-side critical sections can run concurrently with updates,
+ * it is important to make sure that objects are consistent when being
+ * accessed. This is achieved with a publish/subscribe mechanism that relies
+ * on the natural atomicity of machine word updates in memory, i.e. all
+ * supported architectures must guarantee that, when updating a word, and
+ * in turn a pointer, other processors reading that word obtain a valid
+ * value, that is either the previous or the next value of the word, but not
+ * a mixed-up value. The llsync module provides the llsync_assign_ptr() and
+ * llsync_read_ptr() wrappers that take care of low level details such as
+ * compiler and memory barriers, so that objects are completely built and
+ * consistent when published and accessed.
+ *
+ * As objects are published through pointers, multiple versions can exist at
+ * the same time. Previous versions cannot be deleted as long as read-side
+ * references may exist. Operations that must wait for all read-side references
+ * to be dropped can be either synchronous, i.e. block until it is safe to
+ * proceed, or be deferred, in which case they are queued and later handed to
+ * the work module. As a result, special care must be taken if using lockless
+ * synchronization in the work module itself.
+ *
+ * The two system events tracked by the llsync module are context switches
+ * and a periodic event, normally the periodic timer interrupt that drives
+ * the scheduler. Context switches are used as checkpoint triggers. A
+ * checkpoint is a point in execution at which no read-side reference can
+ * exist, i.e. the processor isn't running any read-side critical section.
+ * Since context switches can be very frequent, a checkpoint is local to
+ * the processor and lightweight. The periodic event is used to commit
+ * checkpoints globally so that other processors are aware of the progress
+ * of one another. As the system allows situations in which two periodic
+ * events can occur without a single context switch, the periodic event is
+ * also used as a checkpoint trigger. When all checkpoints have been
+ * committed, a global checkpoint occurs. The occurrence of global checkpoints
+ * allows the llsync module to determine when it is safe to process deferred
+ * work or unblock update sides.
*/
#ifndef _KERN_LLSYNC_H
@@ -30,10 +79,6 @@
/*
* Safely assign a pointer.
- *
- * This macro enforces memory ordering. It should be used to reference
- * objects once they're completely built, so that readers accessing the
- * pointer obtain consistent data.
*/
#define llsync_assign_ptr(ptr, value) \
MACRO_BEGIN \
@@ -48,27 +93,31 @@ MACRO_END
*/
#define llsync_read_ptr(ptr) (ptr)
+/*
+ * Read-side critical section enter/exit functions.
+ *
+ * It is not allowed to block inside a read-side critical section.
+ */
+
static inline void
llsync_read_enter(void)
{
- thread_preempt_disable();
+ int in_read_cs;
+
+ in_read_cs = thread_llsync_in_read_cs();
+ thread_llsync_read_inc();
+
+ if (!in_read_cs)
+ thread_preempt_disable();
}
static inline void
llsync_read_exit(void)
{
- thread_preempt_enable();
-}
+ thread_llsync_read_dec();
-/*
- * Report that a processor has reached a checkpoint.
- *
- * Called during context switch.
- */
-static inline void
-llsync_checkin(unsigned int cpu)
-{
- llsync_cpus[cpu].checked = 1;
+ if (!thread_llsync_in_read_cs())
+ thread_preempt_enable();
}
/*
@@ -77,34 +126,39 @@ llsync_checkin(unsigned int cpu)
void llsync_setup(void);
/*
- * Report that a processor will be regularly checking in.
+ * Manage registration of the current processor.
*
- * Registered processors perform checkpoint commits and receive checkpoint
- * reset interrupts.
- */
-void llsync_register_cpu(unsigned int cpu);
-
-/*
- * Report that a processor has entered a state in which checking in becomes
- * irrelevant (e.g. the idle loop).
+ * The caller must not be allowed to migrate when calling these functions.
+ *
+ * Registering tells the llsync module that the current processor reports
+ * context switches and periodic events.
+ *
+ * When a processor enters a state in which checking in becomes irrelevant,
+ * it unregisters itself so that the other registered processors don't need
+ * to wait for it to make progress. For example, this is done inside the
+ * idle loop since it is obviously impossible to enter a read-side critical
+ * section while idling.
*/
-void llsync_unregister_cpu(unsigned int cpu);
+void llsync_register(void);
+void llsync_unregister(void);
/*
- * Commit a pending checkpoint.
+ * Report a context switch on the current processor.
*
- * Checking in is a light processor-local operation. Committing a checkpoint
- * is a heavier global one, and is performed less often, normally during the
- * system timer interrupt.
+ * Interrupts and preemption must be disabled when calling this function.
*/
-void llsync_commit_checkpoint(unsigned int cpu);
+static inline void
+llsync_report_context_switch(void)
+{
+ llsync_checkin();
+}
/*
- * Reset the checkpoint pending state of a processor.
+ * Report a periodic event on the current processor.
*
- * Called from interrupt context.
+ * Interrupts and preemption must be disabled when calling this function.
*/
-void llsync_reset_checkpoint(unsigned int cpu);
+void llsync_report_periodic_event(void);
/*
* Defer an operation until all existing read-side references are dropped,
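
For reference, a minimal usage sketch of the interface documented above. The obj structure, global_ptr variable, obj_read()/obj_replace() functions and the exact work_init() call are illustrative assumptions; only the llsync_*() macros and functions come from this module, and updaters are assumed to serialize among themselves with a conventional lock:

    #include <stddef.h>

    #include <kern/llsync.h>
    #include <kern/work.h>

    struct obj {
        struct work work;           /* embedded so destruction can be deferred */
        int value;
    };

    static struct obj *global_ptr;  /* pointer published to readers */

    /* Read side: no locks, no atomic instructions. */
    static int
    obj_read(void)
    {
        struct obj *obj;
        int value;

        llsync_read_enter();
        obj = llsync_read_ptr(global_ptr);
        value = (obj != NULL) ? obj->value : 0;
        llsync_read_exit();

        return value;
    }

    /* Update side: the new object is fully built before being published,
     * and the old one is only reclaimed once all read-side references
     * may have been dropped. */
    static void
    obj_replace(struct obj *new_obj, void (*obj_reclaim)(struct work *))
    {
        struct obj *old_obj;

        old_obj = global_ptr;
        llsync_assign_ptr(global_ptr, new_obj);

        if (old_obj != NULL) {
            work_init(&old_obj->work, obj_reclaim);  /* assumed work API */
            llsync_defer(&old_obj->work);
        }
    }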
diff --git a/kern/llsync_i.h b/kern/llsync_i.h
index 5dae472e..f14c9c8d 100644
--- a/kern/llsync_i.h
+++ b/kern/llsync_i.h
@@ -18,21 +18,98 @@
#ifndef _KERN_LLSYNC_I_H
#define _KERN_LLSYNC_I_H
+#include <kern/assert.h>
+#include <kern/cpumap.h>
#include <kern/evcnt.h>
+#include <kern/macros.h>
#include <kern/param.h>
+#include <kern/spinlock.h>
+#include <kern/work.h>
+#include <machine/cpu.h>
+
+/*
+ * Global data.
+ *
+ * The queue number matches the number of global checkpoints that occurred
+ * since works contained in it were added. After two global checkpoints,
+ * works are scheduled for processing.
+ *
+ * Interrupts must be disabled when acquiring the global data lock.
+ */
+struct llsync_data {
+ struct spinlock lock;
+ struct cpumap registered_cpus;
+ unsigned int nr_registered_cpus;
+ struct cpumap pending_checkpoints;
+ unsigned int nr_pending_checkpoints;
+ struct work_queue queue0;
+ struct work_queue queue1;
+ unsigned long nr_pending_works;
+ struct evcnt ev_global_checkpoint;
+ struct evcnt ev_periodic_checkin;
+ struct evcnt ev_failed_periodic_checkin;
+
+ /*
+ * Global checkpoint ID.
+ *
+ * This variable can be frequently accessed from many processors, so:
+ * - reserve a whole cache line for it
+ * - apply optimistic accesses to reduce contention
+ */
+ struct {
+ volatile unsigned int value __aligned(CPU_L1_SIZE);
+ } gcid;
+};
+
+extern struct llsync_data llsync_data;
/*
* Per-processor data.
*
- * Interrupts must be disabled on access.
+ * Every processor records whether it is registered and a local copy of the
+ * global checkpoint ID, which is meaningless on unregistered processors.
+ * The true global checkpoint ID is incremented when a global checkpoint occurs,
+ * after which all the local copies become stale. Checking in synchronizes
+ * the local copy of the global checkpoint ID.
+ *
+ * Interrupts and preemption must be disabled on access.
*/
-struct llsync_cpu {
+struct llsync_cpu_data {
int registered;
- int checked;
- struct evcnt ev_reset;
- struct evcnt ev_spurious_reset;
+ unsigned int gcid;
} __aligned(CPU_L1_SIZE);
-extern struct llsync_cpu llsync_cpus[MAX_CPUS];
+extern struct llsync_cpu_data llsync_cpu_data[MAX_CPUS];
+
+static inline struct llsync_cpu_data *
+llsync_get_cpu_data(unsigned int cpu)
+{
+ return &llsync_cpu_data[cpu];
+}
+
+static inline void
+llsync_checkin(void)
+{
+ struct llsync_cpu_data *cpu_data;
+ unsigned int cpu, gcid;
+
+ assert(!cpu_intr_enabled());
+ assert(!thread_preempt_enabled());
+
+ cpu = cpu_id();
+ cpu_data = llsync_get_cpu_data(cpu);
+
+ if (!cpu_data->registered)
+ return;
+
+ /*
+ * The global checkpoint ID obtained might be obsolete here, in which
+ * case a commit will not determine that a checkpoint actually occurred.
+ * This should seldom happen.
+ */
+ gcid = llsync_data.gcid.value;
+ assert((gcid - cpu_data->gcid) <= 1);
+ cpu_data->gcid = gcid;
+}
#endif /* _KERN_LLSYNC_I_H */
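
The checkin logic above relies on unsigned modular arithmetic: LLSYNC_INITIAL_GCID starts the counter close to UINT_MAX so that wraparound is exercised early in the life of the system, and the assert((gcid - cpu_data->gcid) <= 1) check remains valid across it. A self-contained sketch (hypothetical names) showing why the distance check survives overflow:

    #include <assert.h>
    #include <stdio.h>

    /* Same idea as LLSYNC_INITIAL_GCID: start near the top of the range. */
    #define INITIAL_GCID ((unsigned int)-10)

    int
    main(void)
    {
        unsigned int gcid = INITIAL_GCID;   /* global checkpoint ID */
        unsigned int local = gcid;          /* per-processor copy */

        for (int i = 0; i < 20; i++) {
            gcid++;                         /* a global checkpoint occurs */

            /* Unsigned subtraction wraps modulo 2^32, so the local copy is
             * never seen as more than one checkpoint behind, even after
             * gcid has wrapped past zero. */
            assert((gcid - local) <= 1);

            local = gcid;                   /* the processor checks in */
        }

        printf("gcid is now %u, wrapped past zero as expected\n", gcid);
        return 0;
    }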
diff --git a/kern/thread.c b/kern/thread.c
index 60f57904..9ed55d3c 100644
--- a/kern/thread.c
+++ b/kern/thread.c
@@ -482,7 +482,7 @@ thread_runq_schedule(struct thread_runq *runq, struct thread *prev)
assert(!cpu_intr_enabled());
spinlock_assert_locked(&runq->lock);
- llsync_checkin(thread_runq_id(runq));
+ llsync_report_context_switch();
thread_clear_flag(prev, THREAD_YIELD);
thread_runq_put_prev(runq, prev);
@@ -1463,6 +1463,7 @@ thread_init(struct thread *thread, void *stack, const struct thread_attr *attr,
thread->state = THREAD_SLEEPING;
thread->preempt = 2;
thread->pinned = 0;
+ thread->llsync_read = 0;
thread->sched_policy = attr->policy;
thread->sched_class = thread_policy_table[attr->policy];
cpumap_copy(&thread->cpumap, cpumap);
@@ -1679,14 +1680,14 @@ static void
thread_idle(void *arg)
{
struct thread *self;
- unsigned int cpu;
+
+ (void)arg;
self = thread_self();
- cpu = thread_runq_id(arg);
for (;;) {
thread_preempt_disable();
- llsync_unregister_cpu(cpu);
+ llsync_unregister();
for (;;) {
cpu_intr_disable();
@@ -1699,7 +1700,7 @@ thread_idle(void *arg)
cpu_idle();
}
- llsync_register_cpu(cpu);
+ llsync_register();
thread_preempt_enable();
}
}
@@ -1735,7 +1736,7 @@ thread_setup_idler(struct thread_runq *runq)
thread_attr_init(&attr, name);
thread_attr_set_cpumap(&attr, cpumap);
thread_attr_set_policy(&attr, THREAD_SCHED_POLICY_IDLE);
- error = thread_init(idler, stack, &attr, thread_idle, runq);
+ error = thread_init(idler, stack, &attr, thread_idle, NULL);
if (error)
panic("thread: unable to initialize idler thread");
@@ -1946,7 +1947,7 @@ thread_run_scheduler(void)
assert(!cpu_intr_enabled());
runq = thread_runq_local();
- llsync_register_cpu(thread_runq_id(runq));
+ llsync_register();
thread = thread_self();
assert(thread == runq->current);
assert(thread->preempt == 1);
@@ -2003,7 +2004,7 @@ thread_tick_intr(void)
runq = thread_runq_local();
evcnt_inc(&runq->ev_tick);
- llsync_commit_checkpoint(thread_runq_id(runq));
+ llsync_report_periodic_event();
thread = thread_self();
spinlock_lock(&runq->lock);
diff --git a/kern/thread.h b/kern/thread.h
index e570cd29..01d59e5f 100644
--- a/kern/thread.h
+++ b/kern/thread.h
@@ -154,6 +154,7 @@ struct thread {
/* Thread-local members */
unsigned short preempt;
unsigned short pinned;
+ unsigned short llsync_read;
/* Common scheduling properties */
unsigned char sched_policy;
@@ -448,6 +449,40 @@ thread_preempt_disable(void)
}
/*
+ * Lockless synchronization read-side critical section nesting counter
+ * control functions.
+ */
+
+static inline int
+thread_llsync_in_read_cs(void)
+{
+ struct thread *thread;
+
+ thread = thread_self();
+ return (thread->llsync_read != 0);
+}
+
+static inline void
+thread_llsync_read_inc(void)
+{
+ struct thread *thread;
+
+ thread = thread_self();
+ thread->llsync_read++;
+ assert(thread->llsync_read != 0);
+}
+
+static inline void
+thread_llsync_read_dec(void)
+{
+ struct thread *thread;
+
+ thread = thread_self();
+ assert(thread->llsync_read != 0);
+ thread->llsync_read--;
+}
+
+/*
* Type for thread-specific data destructor.
*/
typedef void (*thread_dtor_fn_t)(void *);
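
The per-thread llsync_read counter added above is what allows read-side critical sections to nest: only the outermost llsync_read_enter() disables preemption, and only the outermost llsync_read_exit() re-enables it. A short sketch under that assumption (nested_lookup() is hypothetical):

    #include <kern/llsync.h>

    static void
    nested_lookup(void)
    {
        llsync_read_enter();        /* outermost: disables preemption */

        llsync_read_enter();        /* nested: only increments llsync_read */
        /* ... dereference pointers published with llsync_assign_ptr() ... */
        llsync_read_exit();         /* counter drops to 1, preemption stays off */

        llsync_read_exit();         /* outermost: re-enables preemption */
    }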