/* * Copyright (c) 2010-2017 Richard Braun. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * * TODO Review locking. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Properties of a page translation level. */ struct pmap_pt_level { unsigned int skip; unsigned int bits; unsigned int ptes_per_pt; pmap_pte_t mask; }; /* * Table of page translation properties. */ static struct pmap_pt_level pmap_pt_levels[] __read_mostly = { { PMAP_L0_SKIP, PMAP_L0_BITS, PMAP_L0_PTES_PER_PT, PMAP_L0_MASK }, { PMAP_L1_SKIP, PMAP_L1_BITS, PMAP_L1_PTES_PER_PT, PMAP_L1_MASK }, #if PMAP_NR_LEVELS > 2 { PMAP_L2_SKIP, PMAP_L2_BITS, PMAP_L2_PTES_PER_PT, PMAP_L2_MASK }, #if PMAP_NR_LEVELS > 3 { PMAP_L3_SKIP, PMAP_L3_BITS, PMAP_L3_PTES_PER_PT, PMAP_L3_MASK }, #endif /* PMAP_NR_LEVELS > 3 */ #endif /* PMAP_NR_LEVELS > 2 */ }; /* * Per-CPU page tables. */ struct pmap_cpu_table { struct list node; phys_addr_t root_ptp_pa; }; struct pmap { struct pmap_cpu_table *cpu_tables[CONFIG_MAX_CPUS]; }; /* * Type for page table walking functions. * * See pmap_walk_vas(). */ typedef void (*pmap_walk_fn_t)(phys_addr_t pa, unsigned int index, unsigned int level); /* * The kernel per-CPU page tables are used early enough during bootstrap * that using a percpu variable would actually become ugly. This array * is rather small anyway. */ static struct pmap_cpu_table pmap_kernel_cpu_tables[CONFIG_MAX_CPUS] __read_mostly; struct pmap pmap_kernel_pmap; struct pmap *pmap_current_ptr __percpu; #ifdef CONFIG_X86_PAE /* * Alignment required on page directory pointer tables. */ #define PMAP_PDPT_ALIGN 32 /* * "Hidden" kernel root page tables for PAE mode. */ static alignas(PMAP_PDPT_ALIGN) pmap_pte_t pmap_cpu_kpdpts[CONFIG_MAX_CPUS][PMAP_L2_PTES_PER_PT] __read_mostly; #endif /* CONFIG_X86_PAE */ /* * Flags related to page protection. */ #define PMAP_PTE_PROT_MASK PMAP_PTE_RW /* * Table used to convert machine independent protection flags to architecture * specific PTE bits. */ static pmap_pte_t pmap_prot_table[VM_PROT_ALL + 1] __read_mostly; /* * Structures related to inter-processor page table updates. */ #define PMAP_UPDATE_OP_ENTER 1 #define PMAP_UPDATE_OP_REMOVE 2 #define PMAP_UPDATE_OP_PROTECT 3 struct pmap_update_enter_args { uintptr_t va; phys_addr_t pa; int prot; int flags; }; struct pmap_update_remove_args { uintptr_t start; uintptr_t end; }; struct pmap_update_protect_args { uintptr_t start; uintptr_t end; int prot; }; struct pmap_update_op { struct cpumap cpumap; unsigned int operation; union { struct pmap_update_enter_args enter_args; struct pmap_update_remove_args remove_args; struct pmap_update_protect_args protect_args; }; }; /* * Maximum number of operations that can be batched before an implicit * update. */ #define PMAP_UPDATE_MAX_OPS 32 /* * List of update operations. * * A list of update operations is a container of operations that are pending * for a pmap. Updating can be implicit, e.g. when a list has reached its * maximum size, or explicit, when pmap_update() is called. Operation lists * are thread-local objects. * * The cpumap is the union of all processors affected by at least one * operation. */ struct pmap_update_oplist { alignas(CPU_L1_SIZE) struct cpumap cpumap; struct pmap *pmap; unsigned int nr_ops; struct pmap_update_op ops[PMAP_UPDATE_MAX_OPS]; }; /* * Statically allocated data for the main booter thread. */ static struct cpumap pmap_booter_cpumap __initdata; static struct pmap_update_oplist pmap_booter_oplist __initdata; /* * Each regular thread gets an operation list from this cache. */ static struct kmem_cache pmap_update_oplist_cache; /* * Queue holding update requests from remote processors. */ struct pmap_update_queue { struct spinlock lock; struct list requests; }; /* * Syncer thread. * * There is one such thread per processor. They are the recipients of * update requests, providing thread context for the mapping operations * they perform. */ struct pmap_syncer { alignas(CPU_L1_SIZE) struct thread *thread; struct pmap_update_queue queue; struct syscnt sc_updates; struct syscnt sc_update_enters; struct syscnt sc_update_removes; struct syscnt sc_update_protects; }; static void pmap_sync(void *arg); static struct pmap_syncer pmap_syncer __percpu; /* * Maximum number of mappings for which individual TLB invalidations can be * performed. Global TLB flushes are done beyond this value. */ #define PMAP_UPDATE_MAX_MAPPINGS 64 /* * Per processor request, queued on a remote processor. * * The number of mappings is used to determine whether it's best to flush * individual TLB entries or globally flush the TLB. */ struct pmap_update_request { alignas(CPU_L1_SIZE) struct list node; struct spinlock lock; struct thread *sender; const struct pmap_update_oplist *oplist; unsigned int nr_mappings; int done; int error; }; /* * Per processor array of requests. * * When an operation list is to be applied, the thread triggering the update * acquires the processor-local array of requests and uses it to queue requests * on remote processors. */ struct pmap_update_request_array { struct pmap_update_request requests[CONFIG_MAX_CPUS]; struct mutex lock; }; static struct pmap_update_request_array pmap_update_request_array __percpu; static int pmap_do_remote_updates __read_mostly; static struct kmem_cache pmap_cache; #ifdef CONFIG_X86_PAE static char pmap_panic_no_pae[] __bootdata = "pmap: PAE not supported"; #endif /* CONFIG_X86_PAE */ static char pmap_panic_inval_msg[] __bootdata = "pmap: invalid physical address"; static char pmap_panic_directmap_msg[] __bootdata = "pmap: invalid direct physical mapping"; static __always_inline unsigned long pmap_pte_index(uintptr_t va, const struct pmap_pt_level *pt_level) { return ((va >> pt_level->skip) & ((1UL << pt_level->bits) - 1)); } static void __boot pmap_boot_enter(pmap_pte_t *root_ptp, uintptr_t va, phys_addr_t pa, unsigned long pgsize) { const struct pmap_pt_level *pt_level, *pt_levels; pmap_pte_t *pt, *ptp, *pte, bits; unsigned int level, last_level; if (pa != (pa & PMAP_PA_MASK)) { boot_panic(pmap_panic_inval_msg); } switch (pgsize) { #ifdef __LP64__ case (1 << PMAP_L2_SKIP): bits = PMAP_PTE_PS; last_level = 2; break; #endif /* __LP64__ */ case (1 << PMAP_L1_SKIP): bits = PMAP_PTE_PS; last_level = 1; break; default: bits = 0; last_level = 0; } pt_levels = (void *)BOOT_VTOP((uintptr_t)pmap_pt_levels); pt = root_ptp; for (level = PMAP_NR_LEVELS - 1; level != last_level; level--) { pt_level = &pt_levels[level]; pte = &pt[pmap_pte_index(va, pt_level)]; if (*pte != 0) { ptp = (void *)(uintptr_t)(*pte & PMAP_PA_MASK); } else { ptp = biosmem_bootalloc(1); *pte = ((uintptr_t)ptp | PMAP_PTE_RW | PMAP_PTE_P) & pt_level->mask; } pt = ptp; } pt_level = &pt_levels[last_level]; pte = &pt[pmap_pte_index(va, pt_level)]; *pte = (pa & PMAP_PA_MASK) | PMAP_PTE_RW | PMAP_PTE_P | bits; } static unsigned long __boot pmap_boot_get_pgsize(void) { unsigned int eax, ebx, ecx, edx; #ifdef __LP64__ eax = 0x80000000; cpu_cpuid(&eax, &ebx, &ecx, &edx); if (eax <= 0x80000000) { goto out; } eax = 0x80000001; cpu_cpuid(&eax, &ebx, &ecx, &edx); if (edx & CPU_FEATURE4_1GP) { return (1 << PMAP_L2_SKIP); } out: return (1 << PMAP_L1_SKIP); #else /* __LP64__ */ eax = 0; cpu_cpuid(&eax, &ebx, &ecx, &edx); if (eax == 0) { goto out; } eax = 1; cpu_cpuid(&eax, &ebx, &ecx, &edx); #ifdef CONFIG_X86_PAE if (!(edx & CPU_FEATURE2_PAE)) { boot_panic(pmap_panic_no_pae); } return (1 << PMAP_L1_SKIP); #else /* CONFIG_X86_PAE */ if (edx & CPU_FEATURE2_PSE) { return (1 << PMAP_L1_SKIP); } #endif /* CONFIG_X86_PAE */ out: return PAGE_SIZE; #endif /* __LP64__ */ } #ifdef __LP64__ #define pmap_boot_enable_pgext(pgsize) ((void)(pgsize)) #else /* __LP64__ */ static void __boot pmap_boot_enable_pgext(unsigned long pgsize) { if (pgsize == PAGE_SIZE) { return; } /* * On 64-bits systems, PAE is already enabled. * * See the boot module. */ #ifdef CONFIG_X86_PAE cpu_enable_pae(); #else /* CONFIG_X86_PAE */ cpu_enable_pse(); #endif /* CONFIG_X86_PAE */ } #endif /* __LP64__ */ pmap_pte_t * __boot pmap_setup_paging(void) { struct pmap_cpu_table *cpu_table; phys_addr_t pa, directmap_end; unsigned long i, size, pgsize; pmap_pte_t *root_ptp; uintptr_t va; /* Use large pages for the direct physical mapping when possible */ pgsize = pmap_boot_get_pgsize(); pmap_boot_enable_pgext(pgsize); /* * Create the initial mappings. The first is for the .boot section * and acts as the mandatory identity mapping. The second is the * direct physical mapping of physical memory. */ #ifdef CONFIG_X86_PAE root_ptp = (void *)BOOT_VTOP((uintptr_t)pmap_cpu_kpdpts[0]); #else /* CONFIG_X86_PAE */ root_ptp = biosmem_bootalloc(1); #endif /* CONFIG_X86_PAE */ va = vm_page_trunc((uintptr_t)&_boot); pa = va; size = vm_page_round((uintptr_t)&_boot_end) - va; for (i = 0; i < size; i += PAGE_SIZE) { pmap_boot_enter(root_ptp, va, pa, PAGE_SIZE); va += PAGE_SIZE; pa += PAGE_SIZE; } directmap_end = biosmem_directmap_end(); if (directmap_end > (PMAP_END_DIRECTMAP_ADDRESS - PMAP_START_DIRECTMAP_ADDRESS)) { boot_panic(pmap_panic_directmap_msg); } va = PMAP_START_DIRECTMAP_ADDRESS; pa = 0; for (i = 0; i < directmap_end; i += pgsize) { pmap_boot_enter(root_ptp, va, pa, pgsize); va += pgsize; pa += pgsize; } #ifdef __LP64__ /* * On 64-bits systems, the kernel isn't linked at addresses included * in the direct mapping, which requires the creation of an additional * mapping for it. */ va = P2ALIGN((uintptr_t)&_init, pgsize); pa = BOOT_VTOP(va); size = vm_page_round((uintptr_t)&_end) - va; for (i = 0; i < size; i += pgsize) { pmap_boot_enter(root_ptp, va, pa, pgsize); va += pgsize; pa += pgsize; } #endif /* __LP64__ */ cpu_table = (void *)BOOT_VTOP((uintptr_t)&pmap_kernel_cpu_tables[0]); cpu_table->root_ptp_pa = (uintptr_t)root_ptp; return root_ptp; } pmap_pte_t * __boot pmap_ap_setup_paging(void) { struct pmap_cpu_table *cpu_table; struct pmap *pmap; unsigned long pgsize; pgsize = pmap_boot_get_pgsize(); pmap_boot_enable_pgext(pgsize); pmap = (void *)BOOT_VTOP((uintptr_t)&pmap_kernel_pmap); cpu_table = (void *)BOOT_VTOP((uintptr_t)pmap->cpu_tables[boot_ap_id]); #ifdef CONFIG_X86_PAE return (void *)(uint32_t)cpu_table->root_ptp_pa; #else /* CONFIG_X86_PAE */ return (void *)cpu_table->root_ptp_pa; #endif /* CONFIG_X86_PAE */ } /* * Check address range with regard to physical map. */ #define pmap_assert_range(pmap, start, end) \ MACRO_BEGIN \ assert((start) < (end)); \ assert(((end) <= PMAP_START_DIRECTMAP_ADDRESS) \ || ((start) >= PMAP_END_DIRECTMAP_ADDRESS)); \ \ if ((pmap) == pmap_get_kernel_pmap()) { \ assert(((start) >= PMAP_START_KMEM_ADDRESS) \ && ((end) <= PMAP_END_KMEM_ADDRESS)); \ } else { \ assert((end) <= PMAP_END_ADDRESS); \ } \ MACRO_END static inline pmap_pte_t * pmap_ptp_from_pa(phys_addr_t pa) { uintptr_t va; va = vm_page_direct_va(pa); return (pmap_pte_t *)va; } static void pmap_ptp_clear(pmap_pte_t *ptp) { memset(ptp, 0, PAGE_SIZE); } static inline void pmap_pte_set(pmap_pte_t *pte, phys_addr_t pa, pmap_pte_t pte_bits, const struct pmap_pt_level *pt_level) { *pte = ((pa & PMAP_PA_MASK) | PMAP_PTE_P | pte_bits) & pt_level->mask; } static inline void pmap_pte_clear(pmap_pte_t *pte) { *pte = 0; } static inline int pmap_pte_valid(pmap_pte_t pte) { return (pte != 0); } static inline int pmap_pte_large(pmap_pte_t pte) { return ((pte & PMAP_PTE_PS) != 0); } static inline pmap_pte_t * pmap_pte_next(pmap_pte_t pte) { assert(pmap_pte_valid(pte)); return pmap_ptp_from_pa(pte & PMAP_PA_MASK); } /* * Helper function for initialization procedures that require post-fixing * page properties. */ static void __init pmap_walk_vas(uintptr_t start, uintptr_t end, pmap_walk_fn_t walk_fn) { const struct pmap_pt_level *pt_level; phys_addr_t root_ptp_pa, ptp_pa; pmap_pte_t *ptp, *pte; unsigned int index, level; uintptr_t va; assert(vm_page_aligned(start)); assert(start < end); #ifdef __LP64__ assert((start < PMAP_END_ADDRESS) || (start >= PMAP_START_KERNEL_ADDRESS)); #endif /* __LP64__ */ va = start; root_ptp_pa = pmap_get_kernel_pmap()->cpu_tables[cpu_id()]->root_ptp_pa; do { #ifdef __LP64__ /* Handle long mode canonical form */ if (va == PMAP_END_ADDRESS) { va = PMAP_START_KERNEL_ADDRESS; } #endif /* __LP64__ */ level = PMAP_NR_LEVELS - 1; ptp_pa = root_ptp_pa; ptp = pmap_ptp_from_pa(ptp_pa); for (;;) { pt_level = &pmap_pt_levels[level]; index = pmap_pte_index(va, pt_level); pte = &ptp[index]; if (!pmap_pte_valid(*pte)) { break; } walk_fn(ptp_pa, index, level); if ((level == 0) || pmap_pte_large(*pte)) { break; } level--; ptp_pa = *pte & PMAP_PA_MASK; ptp = pmap_ptp_from_pa(ptp_pa); } va = P2END(va, 1UL << pt_level->skip); } while ((va > start) && (va < end)); } static void __init pmap_setup_global_page(phys_addr_t ptp_pa, unsigned int index, unsigned int level) { pmap_pte_t *pte; pte = &pmap_ptp_from_pa(ptp_pa)[index]; if ((level == 0) || pmap_pte_large(*pte)) { *pte |= PMAP_PTE_G; } } static void __init pmap_setup_global_pages(void) { pmap_walk_vas(PMAP_START_KERNEL_ADDRESS, PMAP_END_KERNEL_ADDRESS, pmap_setup_global_page); pmap_pt_levels[0].mask |= PMAP_PTE_G; cpu_enable_global_pages(); } static void pmap_update_oplist_ctor(void *arg) { struct pmap_update_oplist *oplist; oplist = arg; cpumap_zero(&oplist->cpumap); oplist->pmap = NULL; oplist->nr_ops = 0; } static int pmap_update_oplist_create(struct pmap_update_oplist **oplistp) { struct pmap_update_oplist *oplist; oplist = kmem_cache_alloc(&pmap_update_oplist_cache); if (oplist == NULL) { return ENOMEM; } *oplistp = oplist; return 0; } static void pmap_update_oplist_destroy(struct pmap_update_oplist *oplist) { kmem_cache_free(&pmap_update_oplist_cache, oplist); } static struct pmap_update_oplist * pmap_update_oplist_get(void) { struct pmap_update_oplist *oplist; oplist = tcb_get_pmap_update_oplist(tcb_current()); assert(oplist != NULL); return oplist; } static int pmap_update_oplist_prepare(struct pmap_update_oplist *oplist, struct pmap *pmap) { int error; if (oplist->pmap != pmap) { assert(oplist->pmap == NULL); oplist->pmap = pmap; } else if (oplist->nr_ops == ARRAY_SIZE(oplist->ops)) { error = pmap_update(pmap); oplist->pmap = pmap; return error; } return 0; } static struct pmap_update_op * pmap_update_oplist_prev_op(struct pmap_update_oplist *oplist) { if (oplist->nr_ops == 0) { return NULL; } return &oplist->ops[oplist->nr_ops - 1]; } static struct pmap_update_op * pmap_update_oplist_prepare_op(struct pmap_update_oplist *oplist) { assert(oplist->nr_ops < ARRAY_SIZE(oplist->ops)); return &oplist->ops[oplist->nr_ops]; } static void pmap_update_oplist_finish_op(struct pmap_update_oplist *oplist) { struct pmap_update_op *op; assert(oplist->nr_ops < ARRAY_SIZE(oplist->ops)); op = &oplist->ops[oplist->nr_ops]; cpumap_or(&oplist->cpumap, &op->cpumap); oplist->nr_ops++; } static unsigned int pmap_update_oplist_count_mappings(const struct pmap_update_oplist *oplist, unsigned int cpu) { const struct pmap_update_op *op; unsigned int i, nr_mappings; nr_mappings = 0; for (i = 0; i < oplist->nr_ops; i++) { op = &oplist->ops[i]; if (!cpumap_test(&op->cpumap, cpu)) { continue; } switch (op->operation) { case PMAP_UPDATE_OP_ENTER: nr_mappings++; break; case PMAP_UPDATE_OP_REMOVE: nr_mappings += (op->remove_args.end - op->remove_args.start) / PAGE_SIZE; break; case PMAP_UPDATE_OP_PROTECT: nr_mappings += (op->protect_args.end - op->protect_args.start) / PAGE_SIZE; break; default: assert(!"invalid update operation"); } } assert(nr_mappings != 0); return nr_mappings; } static void pmap_update_request_array_init(struct pmap_update_request_array *array) { struct pmap_update_request *request; unsigned int i; for (i = 0; i < ARRAY_SIZE(array->requests); i++) { request = &array->requests[i]; spinlock_init(&request->lock); } mutex_init(&array->lock); } static struct pmap_update_request_array * pmap_update_request_array_acquire(void) { struct pmap_update_request_array *array; thread_pin(); array = cpu_local_ptr(pmap_update_request_array); mutex_lock(&array->lock); return array; } static void pmap_update_request_array_release(struct pmap_update_request_array *array) { mutex_unlock(&array->lock); thread_unpin(); } static void __init pmap_syncer_init(struct pmap_syncer *syncer, unsigned int cpu) { char name[SYSCNT_NAME_SIZE]; struct pmap_update_queue *queue; queue = &syncer->queue; spinlock_init(&queue->lock); list_init(&queue->requests); snprintf(name, sizeof(name), "pmap_updates/%u", cpu); syscnt_register(&syncer->sc_updates, name); snprintf(name, sizeof(name), "pmap_update_enters/%u", cpu); syscnt_register(&syncer->sc_update_enters, name); snprintf(name, sizeof(name), "pmap_update_removes/%u", cpu); syscnt_register(&syncer->sc_update_removes, name); snprintf(name, sizeof(name), "pmap_update_protects/%u", cpu); syscnt_register(&syncer->sc_update_protects, name); } static int __init pmap_bootstrap(void) { struct pmap_cpu_table *cpu_table; unsigned int i; for (i = 0; i < ARRAY_SIZE(pmap_get_kernel_pmap()->cpu_tables); i++) { cpu_table = &pmap_kernel_cpu_tables[i]; pmap_get_kernel_pmap()->cpu_tables[i] = cpu_table; } cpu_local_assign(pmap_current_ptr, pmap_get_kernel_pmap()); pmap_prot_table[VM_PROT_NONE] = 0; pmap_prot_table[VM_PROT_READ] = 0; pmap_prot_table[VM_PROT_WRITE] = PMAP_PTE_RW; pmap_prot_table[VM_PROT_WRITE | VM_PROT_READ] = PMAP_PTE_RW; pmap_prot_table[VM_PROT_EXECUTE] = 0; pmap_prot_table[VM_PROT_EXECUTE | VM_PROT_READ] = 0; pmap_prot_table[VM_PROT_EXECUTE | VM_PROT_WRITE] = PMAP_PTE_RW; pmap_prot_table[VM_PROT_ALL] = PMAP_PTE_RW; pmap_update_request_array_init(cpu_local_ptr(pmap_update_request_array)); pmap_syncer_init(cpu_local_ptr(pmap_syncer), 0); pmap_update_oplist_ctor(&pmap_booter_oplist); tcb_set_pmap_update_oplist(tcb_current(), &pmap_booter_oplist); cpumap_zero(&pmap_booter_cpumap); cpumap_set(&pmap_booter_cpumap, 0); if (cpu_has_global_pages()) { pmap_setup_global_pages(); } return 0; } INIT_OP_DEFINE(pmap_bootstrap, INIT_OP_DEP(cpu_setup, true), INIT_OP_DEP(mutex_setup, true), INIT_OP_DEP(spinlock_setup, true), INIT_OP_DEP(syscnt_setup, true), INIT_OP_DEP(thread_bootstrap, true)); static void __init pmap_setup_set_ptp_type(phys_addr_t ptp_pa, unsigned int index, unsigned int level) { struct vm_page *page; (void)index; if (level == 0) { return; } page = vm_page_lookup(ptp_pa); assert(page != NULL); if (vm_page_type(page) != VM_PAGE_PMAP) { assert(vm_page_type(page) == VM_PAGE_RESERVED); vm_page_set_type(page, 0, VM_PAGE_PMAP); } } static void __init pmap_setup_fix_ptps(void) { pmap_walk_vas(PMAP_START_ADDRESS, PMAP_END_KERNEL_ADDRESS, pmap_setup_set_ptp_type); } static int __init pmap_setup(void) { pmap_setup_fix_ptps(); kmem_cache_init(&pmap_cache, "pmap", sizeof(struct pmap), 0, NULL, 0); kmem_cache_init(&pmap_update_oplist_cache, "pmap_update_oplist", sizeof(struct pmap_update_oplist), CPU_L1_SIZE, pmap_update_oplist_ctor, 0); return 0; } INIT_OP_DEFINE(pmap_setup, INIT_OP_DEP(kmem_setup, true), INIT_OP_DEP(log_setup, true), INIT_OP_DEP(pmap_bootstrap, true), INIT_OP_DEP(vm_page_setup, true)); void __init pmap_ap_setup(void) { cpu_local_assign(pmap_current_ptr, pmap_get_kernel_pmap()); if (cpu_has_global_pages()) { cpu_enable_global_pages(); } else { cpu_tlb_flush(); } } static void __init pmap_copy_cpu_table_page(const pmap_pte_t *sptp, unsigned int level, struct vm_page *page) { const struct pmap_pt_level *pt_level; pmap_pte_t *dptp; pt_level = &pmap_pt_levels[level]; dptp = vm_page_direct_ptr(page); memcpy(dptp, sptp, pt_level->ptes_per_pt * sizeof(pmap_pte_t)); } static void __init pmap_copy_cpu_table_recursive(const pmap_pte_t *sptp, unsigned int level, pmap_pte_t *dptp, uintptr_t start_va) { const struct pmap_pt_level *pt_level; struct vm_page *page; phys_addr_t pa; unsigned int i; uintptr_t va; assert(level != 0); pt_level = &pmap_pt_levels[level]; memset(dptp, 0, pt_level->ptes_per_pt * sizeof(pmap_pte_t)); for (i = 0, va = start_va; i < pt_level->ptes_per_pt; i++, va = P2END(va, 1UL << pt_level->skip)) { #ifdef __LP64__ /* Handle long mode canonical form */ if (va == PMAP_END_ADDRESS) { va = PMAP_START_KERNEL_ADDRESS; } #endif /* __LP64__ */ if (!pmap_pte_valid(sptp[i])) { continue; } else if (pmap_pte_large(sptp[i])) { dptp[i] = sptp[i]; continue; } page = vm_page_alloc(0, VM_PAGE_SEL_DIRECTMAP, VM_PAGE_PMAP); if (page == NULL) { panic("pmap: unable to allocate page table page copy"); } pa = vm_page_to_pa(page); dptp[i] = (sptp[i] & ~PMAP_PA_MASK) | (pa & PMAP_PA_MASK); if (((level - 1) == 0) || pmap_pte_large(sptp[i])) { pmap_copy_cpu_table_page(pmap_pte_next(sptp[i]), level - 1, page); } else { pmap_copy_cpu_table_recursive(pmap_pte_next(sptp[i]), level - 1, vm_page_direct_ptr(page), va); } } } static void __init pmap_copy_cpu_table(unsigned int cpu) { struct pmap_cpu_table *cpu_table; struct pmap *kernel_pmap; unsigned int level; const pmap_pte_t *sptp; pmap_pte_t *dptp; assert(cpu != 0); kernel_pmap = pmap_get_kernel_pmap(); assert(cpu < ARRAY_SIZE(kernel_pmap->cpu_tables)); cpu_table = kernel_pmap->cpu_tables[cpu]; level = PMAP_NR_LEVELS - 1; sptp = pmap_ptp_from_pa(kernel_pmap->cpu_tables[cpu_id()]->root_ptp_pa); #ifdef CONFIG_X86_PAE cpu_table->root_ptp_pa = BOOT_VTOP((uintptr_t)pmap_cpu_kpdpts[cpu]); dptp = pmap_ptp_from_pa(cpu_table->root_ptp_pa); #else /* CONFIG_X86_PAE */ struct vm_page *page; page = vm_page_alloc(0, VM_PAGE_SEL_DIRECTMAP, VM_PAGE_PMAP); if (page == NULL) { panic("pmap: unable to allocate page table root page copy"); } cpu_table->root_ptp_pa = vm_page_to_pa(page); dptp = vm_page_direct_ptr(page); #endif /* CONFIG_X86_PAE */ pmap_copy_cpu_table_recursive(sptp, level, dptp, PMAP_START_ADDRESS); } void __init pmap_mp_setup(void) { char name[THREAD_NAME_SIZE]; struct pmap_update_oplist *oplist; struct thread_attr attr; struct pmap_syncer *syncer; struct cpumap *cpumap; struct tcb *tcb; unsigned int cpu; int error; error = cpumap_create(&cpumap); if (error) { panic("pmap: unable to create syncer cpumap"); } for (cpu = 1; cpu < cpu_count(); cpu++) { pmap_update_request_array_init(percpu_ptr(pmap_update_request_array, cpu)); pmap_syncer_init(percpu_ptr(pmap_syncer, cpu), cpu); } for (cpu = 0; cpu < cpu_count(); cpu++) { syncer = percpu_ptr(pmap_syncer, cpu); snprintf(name, sizeof(name), THREAD_KERNEL_PREFIX "pmap_sync/%u", cpu); cpumap_zero(cpumap); cpumap_set(cpumap, cpu); thread_attr_init(&attr, name); thread_attr_set_cpumap(&attr, cpumap); thread_attr_set_priority(&attr, THREAD_SCHED_FS_PRIO_MAX); error = thread_create(&syncer->thread, &attr, pmap_sync, syncer); if (error) { panic("pmap: unable to create syncer thread"); } tcb = thread_get_tcb(syncer->thread); oplist = tcb_get_pmap_update_oplist(tcb); tcb_set_pmap_update_oplist(tcb, NULL); kmem_cache_free(&pmap_update_oplist_cache, oplist); } cpumap_destroy(cpumap); for (cpu = 1; cpu < cpu_count(); cpu++) { pmap_copy_cpu_table(cpu); } pmap_do_remote_updates = 1; } int pmap_thread_build(struct thread *thread) { struct pmap_update_oplist *oplist; int error; error = pmap_update_oplist_create(&oplist); if (error) { return error; } tcb_set_pmap_update_oplist(thread_get_tcb(thread), oplist); return 0; } void pmap_thread_cleanup(struct thread *thread) { struct pmap_update_oplist *oplist; oplist = tcb_get_pmap_update_oplist(thread_get_tcb(thread)); if (oplist) { pmap_update_oplist_destroy(oplist); } } int pmap_kextract(uintptr_t va, phys_addr_t *pap) { const struct pmap_pt_level *pt_level; struct pmap *kernel_pmap; pmap_pte_t *ptp, *pte; unsigned int level; level = PMAP_NR_LEVELS - 1; kernel_pmap = pmap_get_kernel_pmap(); ptp = pmap_ptp_from_pa(kernel_pmap->cpu_tables[cpu_id()]->root_ptp_pa); for (;;) { pt_level = &pmap_pt_levels[level]; pte = &ptp[pmap_pte_index(va, pt_level)]; if (!pmap_pte_valid(*pte)) { return EFAULT; } if ((level == 0) || pmap_pte_large(*pte)) { break; } level--; ptp = pmap_pte_next(*pte); } *pap = (*pte & PMAP_PA_MASK); return 0; } int pmap_create(struct pmap **pmapp) { struct pmap *pmap; unsigned int i; pmap = kmem_cache_alloc(&pmap_cache); if (pmap == NULL) { return ENOMEM; } for (i = 0; i < ARRAY_SIZE(pmap->cpu_tables); i++) { pmap->cpu_tables[i] = NULL; } *pmapp = pmap; return 0; } static int pmap_enter_local(struct pmap *pmap, uintptr_t va, phys_addr_t pa, int prot, int flags) { const struct pmap_pt_level *pt_level; struct vm_page *page; phys_addr_t ptp_pa; pmap_pte_t *ptp, *pte, pte_bits; unsigned int level; /* TODO Page attributes */ (void)flags; pte_bits = PMAP_PTE_RW; if (pmap != pmap_get_kernel_pmap()) { pte_bits |= PMAP_PTE_US; } level = PMAP_NR_LEVELS - 1; ptp = pmap_ptp_from_pa(pmap->cpu_tables[cpu_id()]->root_ptp_pa); for (;;) { pt_level = &pmap_pt_levels[level]; pte = &ptp[pmap_pte_index(va, pt_level)]; if (level == 0) { break; } if (pmap_pte_valid(*pte)) { ptp = pmap_pte_next(*pte); } else { page = vm_page_alloc(0, VM_PAGE_SEL_DIRECTMAP, VM_PAGE_PMAP); if (page == NULL) { log_warning("pmap: page table page allocation failure"); return ENOMEM; } ptp_pa = vm_page_to_pa(page); ptp = pmap_ptp_from_pa(ptp_pa); pmap_ptp_clear(ptp); pmap_pte_set(pte, ptp_pa, pte_bits, pt_level); } level--; } assert(!pmap_pte_valid(*pte)); pte_bits = ((pmap == pmap_get_kernel_pmap()) ? PMAP_PTE_G : PMAP_PTE_US) | pmap_prot_table[prot & VM_PROT_ALL]; pmap_pte_set(pte, pa, pte_bits, pt_level); return 0; } int pmap_enter(struct pmap *pmap, uintptr_t va, phys_addr_t pa, int prot, int flags) { struct pmap_update_oplist *oplist; struct pmap_update_op *op; int error; va = vm_page_trunc(va); pa = vm_page_trunc(pa); pmap_assert_range(pmap, va, va + PAGE_SIZE); oplist = pmap_update_oplist_get(); error = pmap_update_oplist_prepare(oplist, pmap); if (error) { return error; } op = pmap_update_oplist_prepare_op(oplist); if (flags & PMAP_PEF_GLOBAL) { cpumap_copy(&op->cpumap, cpumap_all()); } else { cpumap_zero(&op->cpumap); cpumap_set(&op->cpumap, cpu_id()); } op->operation = PMAP_UPDATE_OP_ENTER; op->enter_args.va = va; op->enter_args.pa = pa; op->enter_args.prot = prot; op->enter_args.flags = flags & ~PMAP_PEF_GLOBAL; pmap_update_oplist_finish_op(oplist); return 0; } static void pmap_remove_local_single(struct pmap *pmap, uintptr_t va) { const struct pmap_pt_level *pt_level; pmap_pte_t *ptp, *pte; unsigned int level; level = PMAP_NR_LEVELS - 1; ptp = pmap_ptp_from_pa(pmap->cpu_tables[cpu_id()]->root_ptp_pa); for (;;) { pt_level = &pmap_pt_levels[level]; pte = &ptp[pmap_pte_index(va, pt_level)]; if (!pmap_pte_valid(*pte)) { return; } if (level == 0) { break; } level--; ptp = pmap_pte_next(*pte); } pmap_pte_clear(pte); } static void pmap_remove_local(struct pmap *pmap, uintptr_t start, uintptr_t end) { while (start < end) { pmap_remove_local_single(pmap, start); start += PAGE_SIZE; } } int pmap_remove(struct pmap *pmap, uintptr_t va, const struct cpumap *cpumap) { struct pmap_update_oplist *oplist; struct pmap_update_op *op; int error; va = vm_page_trunc(va); pmap_assert_range(pmap, va, va + PAGE_SIZE); oplist = pmap_update_oplist_get(); error = pmap_update_oplist_prepare(oplist, pmap); if (error) { return error; } /* Attempt naive merge with previous operation */ op = pmap_update_oplist_prev_op(oplist); if ((op != NULL) && (op->operation == PMAP_UPDATE_OP_REMOVE) && (op->remove_args.end == va) && (cpumap_cmp(&op->cpumap, cpumap) == 0)) { op->remove_args.end = va + PAGE_SIZE; return 0; } op = pmap_update_oplist_prepare_op(oplist); cpumap_copy(&op->cpumap, cpumap); op->operation = PMAP_UPDATE_OP_REMOVE; op->remove_args.start = va; op->remove_args.end = va + PAGE_SIZE; pmap_update_oplist_finish_op(oplist); return 0; } static void pmap_protect_local(struct pmap *pmap, uintptr_t start, uintptr_t end, int prot) { (void)pmap; (void)start; (void)end; (void)prot; /* TODO Implement */ panic("pmap: pmap_protect not implemented"); } int pmap_protect(struct pmap *pmap, uintptr_t va, int prot, const struct cpumap *cpumap) { struct pmap_update_oplist *oplist; struct pmap_update_op *op; int error; va = vm_page_trunc(va); pmap_assert_range(pmap, va, va + PAGE_SIZE); oplist = pmap_update_oplist_get(); error = pmap_update_oplist_prepare(oplist, pmap); if (error) { return error; } /* Attempt naive merge with previous operation */ op = pmap_update_oplist_prev_op(oplist); if ((op != NULL) && (op->operation == PMAP_UPDATE_OP_PROTECT) && (op->protect_args.end == va) && (op->protect_args.prot == prot) && (cpumap_cmp(&op->cpumap, cpumap) == 0)) { op->protect_args.end = va + PAGE_SIZE; return 0; } op = pmap_update_oplist_prepare_op(oplist); cpumap_copy(&op->cpumap, cpumap); op->operation = PMAP_UPDATE_OP_PROTECT; op->protect_args.start = va; op->protect_args.end = va + PAGE_SIZE; op->protect_args.prot = prot; pmap_update_oplist_finish_op(oplist); return 0; } static void pmap_flush_tlb(struct pmap *pmap, uintptr_t start, uintptr_t end) { if ((pmap != pmap_current()) && (pmap != pmap_get_kernel_pmap())) { return; } while (start < end) { cpu_tlb_flush_va(start); start += PAGE_SIZE; } } static void pmap_flush_tlb_all(struct pmap *pmap) { if ((pmap != pmap_current()) && (pmap != pmap_get_kernel_pmap())) { return; } if (pmap == pmap_get_kernel_pmap()) { cpu_tlb_flush_all(); } else { cpu_tlb_flush(); } } static int pmap_update_enter(struct pmap *pmap, int flush, const struct pmap_update_enter_args *args) { int error; error = pmap_enter_local(pmap, args->va, args->pa, args->prot, args->flags); if (error) { return error; } if (flush) { pmap_flush_tlb(pmap, args->va, args->va + PAGE_SIZE); } return 0; } static void pmap_update_remove(struct pmap *pmap, int flush, const struct pmap_update_remove_args *args) { pmap_remove_local(pmap, args->start, args->end); if (flush) { pmap_flush_tlb(pmap, args->start, args->end); } } static void pmap_update_protect(struct pmap *pmap, int flush, const struct pmap_update_protect_args *args) { pmap_protect_local(pmap, args->start, args->end, args->prot); if (flush) { pmap_flush_tlb(pmap, args->start, args->end); } } static int pmap_update_local(const struct pmap_update_oplist *oplist, unsigned int nr_mappings) { const struct pmap_update_op *op; struct pmap_syncer *syncer; int error, global_tlb_flush; unsigned int i; syncer = cpu_local_ptr(pmap_syncer); syscnt_inc(&syncer->sc_updates); global_tlb_flush = (nr_mappings > PMAP_UPDATE_MAX_MAPPINGS); error = 0; for (i = 0; i < oplist->nr_ops; i++) { op = &oplist->ops[i]; if (!cpumap_test(&op->cpumap, cpu_id())) { continue; } switch (op->operation) { case PMAP_UPDATE_OP_ENTER: syscnt_inc(&syncer->sc_update_enters); error = pmap_update_enter(oplist->pmap, !global_tlb_flush, &op->enter_args); break; case PMAP_UPDATE_OP_REMOVE: syscnt_inc(&syncer->sc_update_removes); pmap_update_remove(oplist->pmap, !global_tlb_flush, &op->remove_args); break; case PMAP_UPDATE_OP_PROTECT: syscnt_inc(&syncer->sc_update_protects); pmap_update_protect(oplist->pmap, !global_tlb_flush, &op->protect_args); break; default: assert(!"invalid update operation"); } if (error) { return error; } } if (global_tlb_flush) { pmap_flush_tlb_all(oplist->pmap); } return 0; } int pmap_update(struct pmap *pmap) { struct pmap_update_oplist *oplist; struct pmap_update_request_array *array; struct pmap_update_request *request; struct pmap_update_queue *queue; struct pmap_syncer *syncer; unsigned int nr_mappings; int error, cpu; oplist = pmap_update_oplist_get(); if (pmap != oplist->pmap) { /* Make sure pmap_update() is called before manipulating another pmap */ assert(oplist->pmap == NULL); return 0; } assert(oplist->nr_ops != 0); if (!pmap_do_remote_updates) { nr_mappings = pmap_update_oplist_count_mappings(oplist, cpu_id()); error = pmap_update_local(oplist, nr_mappings); goto out; } error = 0; array = pmap_update_request_array_acquire(); cpumap_for_each(&oplist->cpumap, cpu) { syncer = percpu_ptr(pmap_syncer, cpu); queue = &syncer->queue; request = &array->requests[cpu]; request->sender = thread_self(); request->oplist = oplist; request->nr_mappings = pmap_update_oplist_count_mappings(oplist, cpu); request->done = 0; request->error = 0; spinlock_lock(&queue->lock); list_insert_tail(&queue->requests, &request->node); thread_wakeup(syncer->thread); spinlock_unlock(&queue->lock); } /* TODO Improve scalability */ cpumap_for_each(&oplist->cpumap, cpu) { request = &array->requests[cpu]; spinlock_lock(&request->lock); while (!request->done) { thread_sleep(&request->lock, request, "pmaprq"); } if (!error && request->error) { error = request->error; } spinlock_unlock(&request->lock); } pmap_update_request_array_release(array); out: cpumap_zero(&oplist->cpumap); oplist->pmap = NULL; oplist->nr_ops = 0; return error; } static void pmap_sync(void *arg) { struct pmap_update_queue *queue; struct pmap_update_request *request; struct pmap_syncer *self; int error; self = arg; queue = &self->queue; for (;;) { spinlock_lock(&queue->lock); while (list_empty(&queue->requests)) { thread_sleep(&queue->lock, queue, "pmapq"); } request = list_first_entry(&queue->requests, struct pmap_update_request, node); list_remove(&request->node); spinlock_unlock(&queue->lock); error = pmap_update_local(request->oplist, request->nr_mappings); spinlock_lock(&request->lock); request->done = 1; request->error = error; thread_wakeup(request->sender); spinlock_unlock(&request->lock); } } void pmap_load(struct pmap *pmap) { struct pmap_cpu_table *cpu_table; assert(!cpu_intr_enabled()); assert(!thread_preempt_enabled()); if (pmap_current() == pmap) { return; } /* TODO Lazy TLB invalidation */ cpu_local_assign(pmap_current_ptr, pmap); /* TODO Implement per-CPU page tables for non-kernel pmaps */ cpu_table = pmap->cpu_tables[cpu_id()]; cpu_set_cr3(cpu_table->root_ptp_pa); }