-rw-r--r--  arch/x86/machine/pmu_amd.c    38
-rw-r--r--  arch/x86/machine/pmu_intel.c  49
-rw-r--r--  arch/x86/machine/trap.c        1
-rw-r--r--  kern/perfmon.c                91
-rw-r--r--  test/test_perfmon_cpu.c       29
5 files changed, 118 insertions, 90 deletions
diff --git a/arch/x86/machine/pmu_amd.c b/arch/x86/machine/pmu_amd.c
index 34df620..8e56bfa 100644
--- a/arch/x86/machine/pmu_amd.c
+++ b/arch/x86/machine/pmu_amd.c
@@ -21,6 +21,7 @@
#include <stdint.h>
#include <kern/init.h>
+#include <kern/clock.h>
#include <kern/log.h>
#include <kern/perfmon.h>
#include <machine/cpu.h>
@@ -175,7 +176,6 @@ pmu_amd_start(unsigned int pmc_id, unsigned int raw_event_id)
/* TODO Handle PERFMON_EF_KERN/PERFMON_EF_USER */
high = code->event_select >> 8;
low = PMU_AMD_EVTSEL_EN
- | PMU_AMD_EVTSEL_INT
| PMU_AMD_EVTSEL_OS
| PMU_AMD_EVTSEL_USR
| (code->umask << 8)
@@ -205,38 +205,6 @@ pmu_amd_write(unsigned int pmc_id, uint64_t value)
cpu_set_msr64(PMU_AMD_MSR_PERCTR0 + pmc_id, value);
}
-/*
- * TODO Make the perfmon module handle basic overflow handling by polling
- * counters.
- */
-static void
-pmu_amd_handle_of_intr_v1(void)
-{
- struct pmu_amd *pmu;
- uint64_t value, prev;
- unsigned int mask;
-
- pmu = pmu_amd_get();
-
- for (unsigned int pmc_id = 0; pmc_id != PMU_AMD_NR_PMCS; pmc_id++) {
- mask = (1U << pmc_id);
-
- if (pmu->pmc_bm & mask) {
- continue;
- }
-
- value = pmu_amd_read(pmc_id);
- prev = perfmon_cpu_pmc_get_prev(pmc_id);
-
- if (prev > value) {
- /* Overflow */
- perfmon_cpu_pmc_inc_of(pmc_id);
- /* Prevents us from overflowing twice */
- perfmon_cpu_pmc_set_prev(pmc_id, value);
- }
- }
-}
-
static int __init
pmu_amd_setup(void)
{
@@ -259,7 +227,9 @@ pmu_amd_setup(void)
pmu->pmc_bm = (1U << PMU_AMD_NR_PMCS) - 1;
pmu_driver.pmc_width = PMU_AMD_PMC_WIDTH;
- pmu_driver.of_max_ticks = 1UL << (pmu_driver.pmc_width - 1);
+ /* Set of_max_ticks so counters are polled before counting more than half their range. */
+ pmu_driver.of_max_ticks =
+ (1UL << (pmu_driver.pmc_width - 1)) / (cpu_get_freq() / CLOCK_FREQ);
pmu_driver.ops.info = pmu_amd_info;
pmu_driver.ops.translate = pmu_amd_translate;
diff --git a/arch/x86/machine/pmu_intel.c b/arch/x86/machine/pmu_intel.c
index ccc2294..e5d9ef7 100644
--- a/arch/x86/machine/pmu_intel.c
+++ b/arch/x86/machine/pmu_intel.c
@@ -21,6 +21,7 @@
#include <stdint.h>
#include <kern/init.h>
+#include <kern/clock.h>
#include <kern/log.h>
#include <kern/perfmon.h>
#include <machine/cpu.h>
@@ -174,7 +175,7 @@ pmu_intel_info(void)
pmu = pmu_intel_get();
nr_events = pmu_popcount(pmu->events);
- log_info("pmu: driver: intel, architectural v%d\n"
+ log_info("pmu: driver: intel, architectural v%d "
"pmu: nr_pmcs: %u, pmc_width: %u, events: %#x, nr_events: %u\n",
pmu->version, pmu->nr_pmcs, pmu->pmc_width, pmu->events,
nr_events);
@@ -233,19 +234,23 @@ static void
pmu_intel_start(unsigned int pmc_id, unsigned int raw_event_id)
{
const struct pmu_intel_event_code *code;
+ struct pmu_intel *pmu;
uint32_t evtsel;
assert(raw_event_id < ARRAY_SIZE(pmu_intel_event_codes));
code = &pmu_intel_event_codes[raw_event_id];
+ pmu = pmu_intel_get();
/* TODO Handle PERFMON_EF_KERN/PERFMON_EF_USER */
evtsel = PMU_INTEL_EVTSEL_EN
| PMU_INTEL_EVTSEL_OS
| PMU_INTEL_EVTSEL_USR
- | PMU_INTEL_EVTSEL_INT
| (code->umask << 8)
| code->event_select;
+ if (pmu->version >= 2) {
+ evtsel |= PMU_INTEL_EVTSEL_INT;
+ }
cpu_set_msr(PMU_INTEL_MSR_EVTSEL0 + pmc_id, 0, evtsel);
}
@@ -267,38 +272,6 @@ pmu_intel_write(unsigned int pmc_id, uint64_t value)
cpu_set_msr64(PMU_INTEL_MSR_PMC0 + pmc_id, value);
}
-/*
- * TODO Make the perfmon module handle basic overflow handling by polling
- * counters.
- */
-static void
-pmu_intel_handle_of_intr_v1(void)
-{
- struct pmu_intel *pmu;
- unsigned int mask;
- uint64_t value;
- uint64_t prev;
-
- pmu = pmu_intel_get();
-
- for (unsigned int pmc_id = 0; pmc_id != pmu->nr_pmcs; pmc_id++) {
- mask = (1U << pmc_id);
- if (pmu->pmc_bm & mask) {
- /* counter not enabled: can't overflow. */
- continue;
- }
-
- value = pmu_intel_read(pmc_id);
- prev = perfmon_cpu_pmc_get_prev(pmc_id);
- if (prev > value) {
- /* Overflow */
- perfmon_cpu_pmc_inc_of(pmc_id);
- /* Prevents us from overflowing twice */
- perfmon_cpu_pmc_set_prev(pmc_id, value);
- }
- }
-}
-
static int
pmu_intel_consume_bits(uint64_t *bits)
{
@@ -330,7 +303,7 @@ pmu_intel_handle_of_intr_v2(void)
pmu_intel_ack_status(status);
pmu = pmu_intel_get();
- status &= ((1U << pmu->pmc_width) - 1);
+ status &= ((1ULL << pmu->pmc_width) - 1);
for (;;) {
pmc_id = pmu_intel_consume_bits(&status);
@@ -366,7 +339,6 @@ pmu_intel_setup(void)
cpu_cpuid(&eax, &ebx, &ecx, &edx);
pmu->version = eax & PMU_INTEL_ID_VERSION_MASK;
- /* TODO Check this */
if (pmu->version == 0) {
return ENODEV;
}
@@ -396,7 +368,10 @@ pmu_intel_setup(void)
pmu_driver.ops.handle_of_intr = pmu_intel_handle_of_intr_v2;
pmu_driver.of_max_ticks = 0;
} else {
- pmu_driver.of_max_ticks = 1UL << (pmu_driver.pmc_width - 1);
+ /* Set of_max_ticks so counters are polled before counting more than half their range. */
+ pmu_driver.ops.handle_of_intr = NULL;
+ pmu_driver.of_max_ticks =
+ (1ULL << (pmu_driver.pmc_width - 1)) / (cpu_get_freq() / CLOCK_FREQ);
}
return perfmon_pmu_register(&pmu_driver);
diff --git a/arch/x86/machine/trap.c b/arch/x86/machine/trap.c
index 101adf8..a7a5cbd 100644
--- a/arch/x86/machine/trap.c
+++ b/arch/x86/machine/trap.c
@@ -214,6 +214,7 @@ trap_setup(void)
trap_install(TRAP_LAPIC_TIMER, TRAP_HF_INTR, lapic_timer_intr);
trap_install(TRAP_LAPIC_ERROR, TRAP_HF_INTR, lapic_error_intr);
trap_install(TRAP_LAPIC_SPURIOUS, TRAP_HF_INTR, lapic_spurious_intr);
+ trap_install(TRAP_LAPIC_PMC_OF, TRAP_HF_INTR, lapic_pmc_of_intr);
return 0;
}
diff --git a/kern/perfmon.c b/kern/perfmon.c
index 17175ca..0211141 100644
--- a/kern/perfmon.c
+++ b/kern/perfmon.c
@@ -48,10 +48,10 @@
#include <kern/perfmon_i.h>
#include <kern/spinlock.h>
#include <kern/thread.h>
+#include <kern/timer.h>
#include <kern/xcall.h>
#include <machine/cpu.h>
#include <machine/pmu.h>
-#include <machine/trap.h>
/*
* Performance monitoring event.
@@ -394,10 +394,23 @@ perfmon_grouplist_destroy(struct perfmon_grouplist *grouplist)
kmem_cache_free(&perfmon_grouplist_cache, grouplist);
}
+static void perfmon_check_of(struct timer *timer);
+
static void __init
-perfmon_cpu_pmu_init(struct perfmon_cpu_pmu *cpu_pmu)
+perfmon_cpu_pmu_init(unsigned int cpuid)
{
unsigned int i;
+ struct perfmon_cpu_pmu *cpu_pmu;
+
+ cpu_pmu = percpu_ptr(perfmon_cpu_pmu, cpuid);
+ cpu_pmu->cpu_id = cpuid;
+ if (!pmu_driver.ops.handle_of_intr) {
+ /* XXX: Use high priority instead of INTR because we might xcall from the
+ * callbacks.
+ */
+ timer_init(&cpu_pmu->of_timer, &perfmon_check_of, TIMER_HIGH_PRIO);
+ timer_schedule(&cpu_pmu->of_timer, pmu_driver.of_max_ticks);
+ }
for (i = 0; i < ARRAY_SIZE(cpu_pmu->pmcs); i++) {
struct perfmon_cpu_pmc *pmc;
@@ -405,7 +418,6 @@ perfmon_cpu_pmu_init(struct perfmon_cpu_pmu *cpu_pmu)
pmc = &cpu_pmu->pmcs[i];
pmc->nr_refs = 0;
- pmc->prev_value = pmu_driver.ops.read(perfmon_pmu.pmcs[i].id);
pmc->overflow_id = 0;
}
@@ -462,6 +474,55 @@ perfmon_cpu_pmc_inc_of(unsigned int pmc_id)
}
static void
+perfmon_check_of_remote(void *arg)
+{
+ perfmon_check_of(arg);
+}
+
+static void
+perfmon_check_pmc_of(struct perfmon_cpu_pmc *cpu_pmc, uint64_t value)
+{
+ uint64_t prev;
+
+ prev = cpu_pmc->prev_value;
+ if (prev > value) {
+ /* Overflow */
+ cpu_pmc->overflow_id++;
+ }
+ cpu_pmc->prev_value = value;
+}
+
+static void
+perfmon_check_of(struct timer *timer)
+{
+ struct perfmon_pmc *pmc;
+ struct perfmon_cpu_pmc *cpu_pmc;
+ struct perfmon_cpu_pmu *cpu_pmu;
+ uint64_t value;
+
+ cpu_pmu = structof(timer, struct perfmon_cpu_pmu, of_timer);
+ if (cpu_pmu->cpu_id != cpu_id()) {
+ xcall_call(perfmon_check_of_remote, timer, cpu_pmu->cpu_id);
+ return;
+ }
+
+ for (size_t i = 0; i < ARRAY_SIZE(perfmon_pmu.pmcs); i++) {
+ pmc = perfmon_pmc_from_index(i);
+ if (pmc->nr_refs == 0) {
+ continue;
+ }
+
+ cpu_pmc = &cpu_pmu->pmcs[i];
+ value = pmu_driver.ops.read(pmc->id);
+
+ perfmon_check_pmc_of(cpu_pmc, value);
+ }
+
+ timer_schedule(timer, pmu_driver.of_max_ticks);
+}
+
+static void
perfmon_cpu_pmu_load(struct perfmon_cpu_pmu *cpu_pmu, unsigned int pmc_index)
{
struct perfmon_cpu_pmc *cpu_pmc;
@@ -471,6 +532,7 @@ perfmon_cpu_pmu_load(struct perfmon_cpu_pmu *cpu_pmu, unsigned int pmc_index)
if (cpu_pmc->nr_refs == 0) {
pmu_driver.ops.start(perfmon_pmu.pmcs[pmc_index].id,
perfmon_pmu.pmcs[pmc_index].raw_event_id);
+ cpu_pmc->prev_value = pmu_driver.ops.read(perfmon_pmu.pmcs[pmc_index].id);
}
cpu_pmc->nr_refs++;
@@ -493,6 +555,7 @@ perfmon_cpu_pmu_unload(struct perfmon_cpu_pmu *cpu_pmu, unsigned int pmc_index)
void
perfmon_of_intr(void)
{
+ assert(pmu_driver.ops.handle_of_intr);
pmu_driver.ops.handle_of_intr();
}
@@ -545,7 +608,7 @@ perfmon_setup(void)
}
for (i = 0; i < cpu_count(); i++) {
- perfmon_cpu_pmu_init(percpu_ptr(perfmon_cpu_pmu, i));
+ perfmon_cpu_pmu_init(i);
}
for (i = 0; i < cpu_count(); i++) {
@@ -563,10 +626,6 @@ perfmon_setup(void)
return ENODEV;
}
pmu_driver.ops.info();
- if (pmu_driver.ops.handle_of_intr) {
- /* FIXME: this should not require an architectural api call. */
- trap_register(TRAP_LAPIC_PMC_OF, lapic_pmc_of_intr);
- }
return 0;
}
@@ -671,12 +730,16 @@ perfmon_event_sync(struct perfmon_cpu_pmu *cpu_pmu,
cpu_pmc = &cpu_pmu->pmcs[event->pmc_index];
count = pmu_driver.ops.read(pmc->id);
+ if (!pmu_driver.ops.handle_of_intr) {
+ /* Force pmc overflow status update */
+ perfmon_check_pmc_of(cpu_pmc, count);
+ }
+
if (unlikely(event->overflow_id != cpu_pmc->overflow_id)) {
assert(cpu_pmc->overflow_id > event->overflow_id);
-
diff = cpu_pmc->overflow_id > event->overflow_id;
/* diff is very likely 1. */
- event->count += (1UL << pmu_driver.pmc_width) * diff
+ event->count += (1ULL << pmu_driver.pmc_width) * diff
- event->prev + count;
event->overflow_id = cpu_pmc->overflow_id;
} else {
@@ -942,9 +1005,8 @@ perfmon_group_load(struct perfmon_group *group)
{
struct perfmon_cpu_pmu *cpu_pmu;
struct perfmon_event *event;
-#ifdef CONFIG_PERFMON_TEST
struct perfmon_pmc *pmc;
-#endif
+ uint64_t prev;
assert(!thread_preempt_enabled());
assert(perfmon_group_enabled(group));
@@ -967,8 +1029,11 @@ perfmon_group_load(struct perfmon_group *group)
#endif
list_for_each_entry(&group->events, event, node) {
+ pmc = perfmon_pmc_from_index(event->pmc_index);
+ prev = pmu_driver.ops.read(pmc->id);
+
perfmon_cpu_pmu_load(cpu_pmu, event->pmc_index);
- event->prev = pmu_driver.ops.read(perfmon_pmu.pmcs[event->pmc_index].id);
+ event->prev = prev;
event->overflow_id = cpu_pmu->pmcs[event->pmc_index].overflow_id;
}
diff --git a/test/test_perfmon_cpu.c b/test/test_perfmon_cpu.c
index 8ecb241..6f1414c 100644
--- a/test/test_perfmon_cpu.c
+++ b/test/test_perfmon_cpu.c
@@ -49,17 +49,30 @@ test_report_event(const struct perfmon_event *event, const char *name)
printf("test: %s: %llu\n", name, count);
}
+static uint64_t
+test_get_pre_overflow_value(uint64_t value)
+{
+ uint64_t pmc_max;
+ unsigned int pmc_width;
+
+ pmc_width = perfmon_get_pmc_width();
+ pmc_max = (1ULL << pmc_width) - 1;
+ pmc_max &= 0xffffffff80000000;
+
+ /* XXX: Work around most processors not allowing full-width writes. */
+ return ((~value + 1) & 0x7fffffff) | pmc_max;
+}
+
static void
test_run(void *arg)
{
struct perfmon_event *ev_cycle, *ev_instruction;
struct perfmon_group *group;
int error;
- uint64_t pmc_max;
+ uint64_t value;
(void)arg;
- pmc_max = (1 << perfmon_get_pmc_width()) - 1;
error = perfmon_group_create(&group);
error_check(error, "perfmon_group_create");
@@ -86,10 +99,14 @@ test_run(void *arg)
test_report_event(ev_instruction, "instruction");
printf("checking with overflow ...\n");
- /* TODO: choose value depending of architecture */
- perfmon_event_write(ev_cycle, pmc_max - perfmon_event_read(ev_cycle) / 2);
- perfmon_event_write(ev_instruction,
- pmc_max - perfmon_event_read(ev_instruction) / 3);
+ value = test_get_pre_overflow_value(perfmon_event_read(ev_cycle) / 2);
+ error = perfmon_event_write(ev_cycle, value);
+ error_check(error, "perfmon_event_write");
+
+ value = test_get_pre_overflow_value(perfmon_event_read(ev_instruction) / 3);
+ error = perfmon_event_write(ev_instruction, value);
+ error_check(error, "perfmon_event_write");
+
perfmon_event_reset(ev_cycle);
perfmon_event_reset(ev_instruction);