2 files changed, 52 insertions, 16 deletions
diff --git a/kern/llsync.c b/kern/llsync.c
index 7c3e1c69..c6e41c06 100644
--- a/kern/llsync.c
+++ b/kern/llsync.c
@@ -75,6 +75,8 @@ struct llsync_waiter {
 void __init
 llsync_setup(void)
 {
+    unsigned int i;
+
     spinlock_init(&llsync_data.lock);
     work_queue_init(&llsync_data.queue0);
     work_queue_init(&llsync_data.queue1);
@@ -85,6 +87,9 @@ llsync_setup(void)
     evcnt_register(&llsync_data.ev_failed_periodic_checkin,
                    "llsync_failed_periodic_checkin");
     llsync_data.gcid.value = LLSYNC_INITIAL_GCID;
+
+    for (i = 0; i < ARRAY_SIZE(llsync_cpu_data); i++)
+        work_queue_init(&llsync_cpu_data[i].queue0);
 }
 
 static void
@@ -96,6 +101,15 @@ llsync_process_global_checkpoint(void)
     assert(cpumap_find_first(&llsync_data.pending_checkpoints) == -1);
     assert(llsync_data.nr_pending_checkpoints == 0);
 
+    nr_works = work_queue_nr_works(&llsync_data.queue0)
+               + work_queue_nr_works(&llsync_data.queue1);
+
+    /* TODO: handle hysteresis (no_warning is set but not cleared here) */
+    if (!llsync_data.no_warning && (nr_works >= LLSYNC_NR_PENDING_WORKS_WARN)) {
+        llsync_data.no_warning = 1;
+        printk("llsync: warning: large number of pending works\n");
+    }
+
     if (llsync_data.nr_registered_cpus == 0) {
         work_queue_concat(&llsync_data.queue1, &llsync_data.queue0);
         work_queue_init(&llsync_data.queue0);
@@ -107,18 +121,25 @@ llsync_process_global_checkpoint(void)
     work_queue_transfer(&queue, &llsync_data.queue1);
     work_queue_transfer(&llsync_data.queue1, &llsync_data.queue0);
     work_queue_init(&llsync_data.queue0);
-    nr_works = work_queue_nr_works(&queue);
 
-    if (nr_works != 0) {
-        llsync_data.nr_pending_works -= nr_works;
+    if (work_queue_nr_works(&queue) != 0)
         work_queue_schedule(&queue, 0);
-    }
 
     llsync_data.gcid.value++;
     evcnt_inc(&llsync_data.ev_global_checkpoint);
 }
 
 static void
+llsync_flush_works(struct llsync_cpu_data *cpu_data)
+{
+    if (work_queue_nr_works(&cpu_data->queue0) == 0)
+        return;
+
+    work_queue_concat(&llsync_data.queue0, &cpu_data->queue0);
+    work_queue_init(&cpu_data->queue0);
+}
+
+static void
 llsync_commit_checkpoint(unsigned int cpu)
 {
     int pending;
@@ -148,6 +169,7 @@ llsync_register(void)
     spinlock_lock_intr_save(&llsync_data.lock, &flags);
 
     assert(!cpu_data->registered);
+    assert(work_queue_nr_works(&cpu_data->queue0) == 0);
     cpu_data->registered = 1;
     cpu_data->gcid = llsync_data.gcid.value;
 
@@ -176,6 +198,8 @@ llsync_unregister(void)
 
     spinlock_lock_intr_save(&llsync_data.lock, &flags);
 
+    llsync_flush_works(cpu_data);
+
     assert(cpu_data->registered);
     cpu_data->registered = 0;
 
@@ -205,11 +229,15 @@ llsync_report_periodic_event(void)
     cpu = cpu_id();
     cpu_data = llsync_get_cpu_data(cpu);
 
-    if (!cpu_data->registered)
+    if (!cpu_data->registered) {
+        assert(work_queue_nr_works(&cpu_data->queue0) == 0);
         return;
+    }
 
     spinlock_lock(&llsync_data.lock);
 
+    llsync_flush_works(cpu_data);
+
     gcid = llsync_data.gcid.value;
     assert((gcid - cpu_data->gcid) <= 1);
 
@@ -239,17 +267,15 @@ llsync_report_periodic_event(void)
 void
 llsync_defer(struct work *work)
 {
+    struct llsync_cpu_data *cpu_data;
     unsigned long flags;
 
-    spinlock_lock_intr_save(&llsync_data.lock, &flags);
-
-    work_queue_push(&llsync_data.queue0, work);
-    llsync_data.nr_pending_works++;
-
-    if (llsync_data.nr_pending_works == LLSYNC_NR_PENDING_WORKS_WARN)
-        printk("llsync: warning: large number of pending works\n");
-
-    spinlock_unlock_intr_restore(&llsync_data.lock, flags);
+    thread_preempt_disable();
+    cpu_intr_save(&flags);
+    cpu_data = llsync_get_cpu_data(cpu_id());
+    work_queue_push(&cpu_data->queue0, work);
+    cpu_intr_restore(flags);
+    thread_preempt_enable();
 }
 
 static void
diff --git a/kern/llsync_i.h b/kern/llsync_i.h
index f14c9c8d..a3329f44 100644
--- a/kern/llsync_i.h
+++ b/kern/llsync_i.h
@@ -42,9 +42,9 @@ struct llsync_data {
     unsigned int nr_registered_cpus;
     struct cpumap pending_checkpoints;
     unsigned int nr_pending_checkpoints;
+    int no_warning;
     struct work_queue queue0;
     struct work_queue queue1;
-    unsigned long nr_pending_works;
     struct evcnt ev_global_checkpoint;
     struct evcnt ev_periodic_checkin;
     struct evcnt ev_failed_periodic_checkin;
@@ -72,11 +72,19 @@ extern struct llsync_data llsync_data;
  * after which all the local copies become stale. Checking in synchronizes
  * the local copy of the global checkpoint ID.
  *
+ * Deferred works are first pushed on a processor-local queue. That queue
+ * is flushed to the global queue whenever the processor may commit a
+ * checkpoint (on periodic events and on unregistration). The downside of
+ * this scalability optimization is additional latency for works that are
+ * pushed on a processor queue between a flush and a global checkpoint,
+ * since they remain local until the next flush.
+ *
  * Interrupts and preemption must be disabled on access.
  */
 struct llsync_cpu_data {
     int registered;
     unsigned int gcid;
+    struct work_queue queue0;
 } __aligned(CPU_L1_SIZE);
 
 extern struct llsync_cpu_data llsync_cpu_data[MAX_CPUS];
@@ -99,8 +107,10 @@ llsync_checkin(void)
     cpu = cpu_id();
     cpu_data = llsync_get_cpu_data(cpu);
 
-    if (!cpu_data->registered)
+    if (!cpu_data->registered) {
+        assert(work_queue_nr_works(&cpu_data->queue0) == 0);
         return;
+    }
 
     /*
      * The global checkpoint ID obtained might be obsolete here, in which