author    | Richard Braun <rbraun@sceen.net> | 2013-07-01 21:09:40 +0200
committer | Richard Braun <rbraun@sceen.net> | 2013-07-01 21:09:40 +0200
commit    | ca3910f342605ab91011e7d72e065b6758ce9df0 (patch)
tree      | d0abddf4f6ac1069f570b378c912fb5cd2abd517
parent    | 75d1643bc6478538b0c8e13447fddd1f3b5b7f13 (diff)
x86/pmap: improve TLB invalidation
Add a processor bitmap per physical map to determine processors on which
a pmap is loaded, so that only those processors receive update IPIs. In
addition, implement lazy TLB invalidation by not loading page tables
when switching to a kernel thread. Finally, the thread module now calls
pmap_load unconditionally, without making assumptions about pmap
optimizations.
-rw-r--r-- | arch/x86/machine/pmap.c | 89
-rw-r--r-- | arch/x86/machine/pmap.h |  5
-rw-r--r-- | kern/thread.c           |  7
3 files changed, 75 insertions(+), 26 deletions(-)
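For readers who want to poke at the protocol outside the kernel: below is a
minimal user-space sketch of the scheme described in the commit message, in
which only processors marked in the pmap's bitmap receive an "IPI" and each
acknowledges through its own per-CPU flag. Everything here (the fake_* names,
IPIs modeled as direct function calls, C11 atomics for the flags) is invented
for illustration and is not the kernel's API; the authoritative code is the
diff below.

#include <stdatomic.h>
#include <stdio.h>

#define FAKE_NR_CPUS 4

/* One bit per processor, standing in for the per-pmap struct cpumap. */
typedef unsigned int fake_cpumap_t;

/* Per-CPU acknowledgment flag, like pmap_update_cpu_data.updated. */
static atomic_int fake_updated[FAKE_NR_CPUS];

/* Stand-in for pmap_update_local(): flush the local TLB range. */
static void
fake_update_local(int cpu)
{
    printf("cpu%d: flushing TLB range\n", cpu);
}

/* Models an IPI plus the receiving pmap_update_intr() handler. */
static void
fake_ipi(int cpu)
{
    fake_update_local(cpu);
    atomic_store(&fake_updated[cpu], 1);
}

/*
 * Models pmap_update(): only CPUs whose bit is set in the pmap's map
 * are asked to flush; the others are never interrupted at all.
 */
static void
fake_update(fake_cpumap_t pmap_cpus, int self)
{
    /* Clear the acknowledgment flags of the targeted CPUs. */
    for (int i = 0; i < FAKE_NR_CPUS; i++)
        if ((pmap_cpus & (1U << i)) && (i != self))
            atomic_store(&fake_updated[i], 0);

    /* Send targeted "IPIs" instead of a broadcast. */
    for (int i = 0; i < FAKE_NR_CPUS; i++)
        if ((pmap_cpus & (1U << i)) && (i != self))
            fake_ipi(i);

    fake_update_local(self);

    /* Wait for every targeted CPU to acknowledge its own flag. */
    for (int i = 0; i < FAKE_NR_CPUS; i++)
        if ((pmap_cpus & (1U << i)) && (i != self))
            while (!atomic_load(&fake_updated[i]))
                ; /* cpu_pause() in the real code */
}

int
main(void)
{
    /* The pmap is loaded on CPUs 0 and 2; CPUs 1 and 3 are left alone. */
    fake_update(0x5, 0);
    return 0;
}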
diff --git a/arch/x86/machine/pmap.c b/arch/x86/machine/pmap.c
index 77544594..febe564c 100644
--- a/arch/x86/machine/pmap.c
+++ b/arch/x86/machine/pmap.c
@@ -16,6 +16,7 @@
  */
 
 #include <kern/assert.h>
+#include <kern/cpumap.h>
 #include <kern/error.h>
 #include <kern/init.h>
 #include <kern/kmem.h>
@@ -29,7 +30,6 @@
 #include <kern/string.h>
 #include <kern/thread.h>
 #include <kern/types.h>
-#include <machine/atomic.h>
 #include <machine/biosmem.h>
 #include <machine/boot.h>
 #include <machine/cpu.h>
@@ -154,16 +154,20 @@ static struct {
 /*
  * TLB invalidation data.
  *
- * TODO Implement generic inter-processor calls with low overhead and use them.
+ * TODO Use per processor sets of update data.
  */
+struct pmap_update_cpu_data {
+    int updated;
+} __aligned(CPU_L1_SIZE);
+
 static struct {
+    struct pmap_update_cpu_data cpu_datas[MAX_CPUS];
     struct spinlock lock;
+    struct cpumap cpumap;
     struct pmap *pmap;
     unsigned long start;
     unsigned long end;
-
-    /* There may be strong bouncing on this counter so give it a cache line */
-    volatile unsigned long nr_pending_updates __aligned(CPU_L1_SIZE);
 } pmap_update_data;
 
 /*
@@ -359,6 +363,8 @@ pmap_bootstrap(void)
     unsigned int i;
 
     mutex_init(&kernel_pmap->lock);
+    cpumap_zero(&kernel_pmap->cpumap);
+    cpumap_set(&kernel_pmap->cpumap, 0);
     cpu_percpu_set_pmap(kernel_pmap);
 
     pmap_boot_heap = (unsigned long)&_end;
@@ -403,6 +409,7 @@
 void __init
 pmap_ap_bootstrap(void)
 {
+    cpumap_set(&kernel_pmap->cpumap, cpu_id());
     cpu_percpu_set_pmap(kernel_pmap);
 
     if (cpu_has_global_pages())
@@ -662,34 +669,43 @@ pmap_update_local(struct pmap *pmap, unsigned long start, unsigned long end)
 void
 pmap_update(struct pmap *pmap, unsigned long start, unsigned long end)
 {
-    unsigned int nr_cpus;
+    unsigned int cpu;
+    int i;
 
-    nr_cpus = cpu_count();
-
-    assert(cpu_intr_enabled() || (nr_cpus == 1));
-
-    if (nr_cpus == 1) {
+    if (cpu_count() == 1) {
         pmap_update_local(pmap, start, end);
         return;
     }
 
+    assert(cpu_intr_enabled());
+
     spinlock_lock(&pmap_update_data.lock);
 
+    cpumap_copy(&pmap_update_data.cpumap, &pmap->cpumap);
+    cpu = cpu_id();
+
+    cpumap_for_each(&pmap_update_data.cpumap, i)
+        if ((unsigned int)i != cpu)
+            pmap_update_data.cpu_datas[i].updated = 0;
+
     pmap_update_data.pmap = pmap;
     pmap_update_data.start = start;
     pmap_update_data.end = end;
-    pmap_update_data.nr_pending_updates = nr_cpus - 1;
     mb_store();
 
-    lapic_ipi_broadcast(TRAP_PMAP_UPDATE);
-
-    /*
-     * Perform the local update now so that some time is given to the other
-     * processors, which slightly reduces contention on the update counter.
-     */
+    if (pmap == kernel_pmap)
+        lapic_ipi_broadcast(TRAP_PMAP_UPDATE);
+    else
+        cpumap_for_each(&pmap_update_data.cpumap, i)
+            if ((unsigned int)i != cpu)
+                lapic_ipi_send(i, TRAP_PMAP_UPDATE);
+
     pmap_update_local(pmap, start, end);
 
-    while (pmap_update_data.nr_pending_updates != 0)
-        cpu_pause();
+    cpumap_for_each(&pmap_update_data.cpumap, i)
+        if ((unsigned int)i != cpu)
+            while (!pmap_update_data.cpu_datas[i].updated)
+                cpu_pause();
 
     spinlock_unlock(&pmap_update_data.lock);
 }
@@ -704,7 +720,7 @@ pmap_update_intr(struct trap_frame *frame)
     /* Interrupts are serializing events, no memory barrier required */
     pmap_update_local(pmap_update_data.pmap, pmap_update_data.start,
                       pmap_update_data.end);
-    atomic_add(&pmap_update_data.nr_pending_updates, -1);
+    pmap_update_data.cpu_datas[cpu_id()].updated = 1;
 }
 
 #ifdef X86_PAE
@@ -818,6 +834,7 @@ pmap_create(struct pmap **pmapp)
     pmap_unmap_pt();
 
     mutex_init(&pmap->lock);
+    cpumap_zero(&pmap->cpumap);
 
     mutex_lock(&pmap_list_lock);
     list_insert_tail(&pmap_list, &pmap->node);
@@ -839,10 +856,40 @@ error_pmap:
 void
 pmap_load(struct pmap *pmap)
 {
+    struct pmap *prev;
+    unsigned int cpu;
+
     assert(!cpu_intr_enabled());
     assert(!thread_preempt_enabled());
 
-    cpu_percpu_set_pmap(pmap);
+    prev = pmap_current();
+
+    if (prev == pmap)
+        return;
+
+    cpu = cpu_id();
+
+    /*
+     * The kernel pmap is considered always loaded on every processor. As a
+     * result, its CPU map is never changed. In addition, don't bother
+     * flushing the TLB when switching to a kernel thread, which results in
+     * a form of lazy TLB invalidation.
+     *
+     * TODO As an exception, force switching when the currently loaded pmap
+     * is about to be destroyed.
+     */
+    if (prev == kernel_pmap) {
+        cpu_percpu_set_pmap(pmap);
+        cpumap_set_atomic(&pmap->cpumap, cpu);
+    } else if (pmap == kernel_pmap) {
+        cpumap_clear_atomic(&prev->cpumap, cpu);
+        cpu_percpu_set_pmap(kernel_pmap);
+        return;
+    } else {
+        cpumap_clear_atomic(&prev->cpumap, cpu);
+        cpu_percpu_set_pmap(pmap);
+        cpumap_set_atomic(&pmap->cpumap, cpu);
+    }
 
 #ifdef X86_PAE
     cpu_set_cr3(pmap->pdpt_pa);
diff --git a/arch/x86/machine/pmap.h b/arch/x86/machine/pmap.h
index c2ea6348..32a56c05 100644
--- a/arch/x86/machine/pmap.h
+++ b/arch/x86/machine/pmap.h
@@ -104,6 +104,8 @@ typedef unsigned long pmap_pte_t;
 
 /*
  * Physical address map.
+ *
+ * TODO Define locking protocol.
  */
 struct pmap {
     struct mutex lock;
@@ -113,6 +115,9 @@ struct pmap {
     pmap_pte_t *pdpt;
     unsigned long pdpt_pa;
 #endif /* X86_PAE */
+
+    /* Processors on which this pmap is loaded */
+    struct cpumap cpumap;
 };
 
 /*
diff --git a/kern/thread.c b/kern/thread.c
index f19ec853..3ee8da93 100644
--- a/kern/thread.c
+++ b/kern/thread.c
@@ -473,8 +473,7 @@ thread_runq_schedule(struct thread_runq *runq, struct thread *prev)
     assert((next != runq->idler) || (runq->nr_threads == 0));
 
     if (prev != next) {
-        if ((prev->task != next->task) && (next->task != kernel_task))
-            pmap_load(next->task->map->pmap);
+        pmap_load(next->task->map->pmap);
 
         /*
          * That's where the true context switch occurs. The next thread must
@@ -1893,9 +1892,7 @@ thread_run(void)
     spinlock_lock(&runq->lock);
     thread = thread_runq_get_next(thread_runq_local());
 
-    if (thread->task != kernel_task)
-        pmap_load(thread->task->map->pmap);
-
+    pmap_load(thread->task->map->pmap);
     tcb_load(&thread->tcb);
 }
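To make the lazy part of the scheme concrete, here is a toy model of the
pmap_load() policy above: "loading" the kernel pmap keeps the previous page
tables installed (kernel mappings are present in every pmap, so no flush is
needed), while loading a user pmap reloads CR3, which also flushes any stale
user entries left over from the lazy period. The toy_* names are invented;
this is an illustration of the design choice, not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct toy_pmap {
    const char *name;
    bool is_kernel;
};

static struct toy_pmap toy_kernel_pmap = { "kernel", true };
static struct toy_pmap *toy_loaded = &toy_kernel_pmap; /* per-CPU in reality */

static void
toy_pmap_load(struct toy_pmap *pmap)
{
    if (toy_loaded == pmap)
        return; /* pmap_load() also returns early when nothing changes */

    toy_loaded = pmap;

    if (pmap->is_kernel) {
        /* Lazy switch: keep the previous page tables, skip the flush. */
        printf("switch to kernel thread: no TLB flush\n");
        return;
    }

    /* Real switch: the CR3 reload doubles as a full (non-global) flush. */
    printf("load %s: CR3 reload flushes stale entries\n", pmap->name);
}

int
main(void)
{
    struct toy_pmap a = { "user-a", false };
    struct toy_pmap b = { "user-b", false };

    toy_pmap_load(&a);               /* flush */
    toy_pmap_load(&toy_kernel_pmap); /* lazy: user-a tables stay loaded */
    toy_pmap_load(&b);               /* flush clears user-a leftovers */
    return 0;
}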