/*
* Copyright (c) 2010, 2012 Richard Braun.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* TODO Check TLB flushes on the recursive mapping.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define PMAP_PTEMAP_INDEX(va, shift) (((va) & PMAP_VA_MASK) >> (shift))
/*
* Recursive mapping of PTEs.
*/
#define PMAP_PTEMAP_BASE ((pmap_pte_t *)VM_PMAP_PTEMAP_ADDRESS)
#define PMAP_LX_INDEX(shift) PMAP_PTEMAP_INDEX(VM_PMAP_PTEMAP_ADDRESS, shift)
/*
* Base addresses of the page tables for each level in the recursive mapping.
*/
#define PMAP_L1_PTEMAP PMAP_PTEMAP_BASE
#define PMAP_L2_PTEMAP (PMAP_L1_PTEMAP + PMAP_LX_INDEX(PMAP_L1_SHIFT))
#define PMAP_L3_PTEMAP (PMAP_L2_PTEMAP + PMAP_LX_INDEX(PMAP_L2_SHIFT))
#define PMAP_L4_PTEMAP (PMAP_L3_PTEMAP + PMAP_LX_INDEX(PMAP_L3_SHIFT))
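/*
 * For example, the level 1 PTE mapping a virtual address va is
 * PMAP_PTEMAP_BASE[PMAP_PTEMAP_INDEX(va, PMAP_L1_SHIFT)]. Applying the same
 * recursion to addresses located inside the recursive mapping itself yields
 * the higher levels: each base above is the previous one plus the index of
 * VM_PMAP_PTEMAP_ADDRESS at the previous level's shift.
 */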
/*
* Flags related to page protection.
*/
#define PMAP_PTE_PROT_MASK PMAP_PTE_RW
/*
* Number of pages to reserve for the pmap module after the kernel.
*
* This pool of pure virtual memory can be used to reserve virtual addresses
* before the VM system is initialized.
*/
#define PMAP_RESERVED_PAGES 2
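/*
 * These pages are handed out by pmap_bootalloc(), for instance:
 *
 *  va = pmap_bootalloc(1);
 *  pmap_kenter(va, pa);
 *  cpu_tlb_flush_va(va);
 *
 * which matches how pmap_zero_va is obtained and then used by
 * pmap_zero_page().
 */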
/*
* Properties of a page translation level.
*/
struct pmap_pt_level {
unsigned int bits;
unsigned int shift;
pmap_pte_t *ptes; /* PTEs in the recursive mapping */
pmap_pte_t mask;
};
#ifdef X86_PAE
/*
* "Hidden" root page table for PAE mode.
*/
static pmap_pte_t pmap_boot_pdpt[PMAP_NR_RPTPS] __aligned(32) __initdata;
static pmap_pte_t pmap_pdpt[PMAP_NR_RPTPS] __aligned(32);
#endif /* X86_PAE */
/*
* Physical address of the page table root, used during bootstrap.
*/
static pmap_pte_t *pmap_boot_root_pt __initdata;
/*
* Physical address of the kernel page table root.
*/
static phys_addr_t pmap_kroot_pt;
/*
* Maximum mappable kernel address.
*/
static unsigned long pmap_kernel_limit;
/*
* Table of page translation properties.
*
* This table is only used before paging is enabled.
*/
static struct pmap_pt_level pmap_boot_pt_levels[] __initdata = {
{ PMAP_L1_BITS, PMAP_L1_SHIFT, PMAP_PTEMAP_BASE, PMAP_L1_MASK },
{ PMAP_L2_BITS, PMAP_L2_SHIFT, PMAP_L2_PTEMAP, PMAP_L2_MASK },
#if PMAP_NR_LEVELS > 2
{ PMAP_L3_BITS, PMAP_L3_SHIFT, PMAP_L3_PTEMAP, PMAP_L3_MASK },
#if PMAP_NR_LEVELS > 3
{ PMAP_L4_BITS, PMAP_L4_SHIFT, PMAP_L4_PTEMAP, PMAP_L4_MASK }
#endif /* PMAP_NR_LEVELS > 3 */
#endif /* PMAP_NR_LEVELS > 2 */
};
/*
* Reserved pages of virtual memory available for early allocation.
*/
static unsigned long pmap_boot_heap __initdata;
static unsigned long pmap_boot_heap_end __initdata;
/*
* Table of page translation properties.
*
* Located at high virtual addresses, it is filled during initialization from
* the content of its bootstrap version.
*/
static struct pmap_pt_level pmap_pt_levels[ARRAY_SIZE(pmap_boot_pt_levels)];
/*
* Table used to convert machine-independent protection flags to
* machine-dependent PTE bits.
*/
static pmap_pte_t pmap_prot_table[8];
/*
 * Special address used for temporary mappings.
*/
static unsigned long pmap_zero_va;
/*
* True if running on multiple processors (TLB flushes must be propagated).
*/
static volatile int pmap_mp_mode;
/*
* Shared variables used by the inter-processor update functions.
*/
static unsigned long pmap_update_start;
static unsigned long pmap_update_end;
/*
 * There is strong bouncing on this counter, so give it its own cache line.
*/
static struct {
volatile unsigned long count __aligned(CPU_L1_SIZE);
} pmap_nr_updates;
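/*
 * Create a one-page mapping in the given root page table during bootstrap,
 * allocating missing intermediate page tables from the BIOS memory
 * allocator.
 */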
static void __init
pmap_boot_enter(pmap_pte_t *root_pt, unsigned long va, phys_addr_t pa)
{
const struct pmap_pt_level *pt_level;
unsigned int level, index;
pmap_pte_t *pt, *ptp, *pte;
if (pa != (pa & PMAP_PA_MASK))
boot_panic("pmap: invalid physical address");
pt = root_pt;
for (level = PMAP_NR_LEVELS; level > 1; level--) {
pt_level = &pmap_boot_pt_levels[level - 1];
index = (va >> pt_level->shift) & ((1 << pt_level->bits) - 1);
pte = &pt[index];
if (*pte & PMAP_PTE_P)
ptp = (void *)(unsigned long)(*pte & PMAP_PA_MASK);
else {
ptp = biosmem_bootalloc(1);
*pte = ((unsigned long)ptp | PMAP_PTE_RW | PMAP_PTE_P)
& pt_level->mask;
}
pt = ptp;
}
/*
* As a special case, a null physical address allocates the page tables
* but doesn't create a mapping.
*/
if (pa == 0)
return;
pte = &pt[(va >> PMAP_L1_SHIFT) & ((1 << PMAP_L1_BITS) - 1)];
*pte = (pa & PMAP_PA_MASK) | PMAP_PTE_RW | PMAP_PTE_P;
}
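/*
 * Establish the recursive mapping by making the root page table entries
 * that cover VM_PMAP_PTEMAP_ADDRESS point back to the root page table
 * pages themselves.
 */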
static void __init
pmap_setup_ptemap(pmap_pte_t *root_pt)
{
const struct pmap_pt_level *pt_level;
phys_addr_t pa;
unsigned long va;
unsigned int i, index;
pt_level = &pmap_boot_pt_levels[PMAP_NR_LEVELS - 1];
for (i = 0; i < PMAP_NR_RPTPS; i++) {
va = VM_PMAP_PTEMAP_ADDRESS + (i * (1 << pt_level->shift));
index = (va >> pt_level->shift) & ((1 << pt_level->bits) - 1);
pa = (unsigned long)root_pt + (i * PAGE_SIZE);
root_pt[index] = (pa | PMAP_PTE_RW | PMAP_PTE_P) & pt_level->mask;
}
}
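/*
 * Build the initial kernel page tables and return the root to be loaded in
 * CR3 when enabling paging.
 */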
pmap_pte_t * __init
pmap_setup_paging(void)
{
pmap_pte_t *root_pt;
unsigned long va;
phys_addr_t pa;
size_t i, size;
/*
 * Create the kernel mappings. The first is for the .init section, and also
 * acts as the mandatory identity mapping. The second is for the persistent
 * kernel code and data at high addresses. The third is the recursive mapping
 * of PTEs.
*
* Any page table required for the virtual addresses that are reserved by
* this module is also allocated.
*/
root_pt = biosmem_bootalloc(PMAP_NR_RPTPS);
va = vm_page_trunc((unsigned long)&_init);
pa = va;
size = vm_page_round((unsigned long)&_einit) - va;
for (i = 0; i < size; i += PAGE_SIZE) {
pmap_boot_enter(root_pt, va, pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
va = vm_page_trunc((unsigned long)&_text);
pa = BOOT_VTOP(va);
size = vm_page_round((unsigned long)&_end) - va;
for (i = 0; i < size; i += PAGE_SIZE) {
pmap_boot_enter(root_pt, va, pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
for (i = 0; i < PMAP_RESERVED_PAGES; i++) {
pmap_boot_enter(root_pt, va, 0);
va += PAGE_SIZE;
}
pmap_setup_ptemap(root_pt);
#ifdef X86_PAE
for (i = 0; i < PMAP_NR_RPTPS; i++)
pmap_boot_pdpt[i] = ((unsigned long)root_pt + (i * PAGE_SIZE))
| PMAP_PTE_P;
pmap_boot_root_pt = pmap_boot_pdpt;
cpu_enable_pae();
#else /* X86_PAE */
pmap_boot_root_pt = root_pt;
#endif /* X86_PAE */
return pmap_boot_root_pt;
}
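/*
 * Counterpart of pmap_setup_paging() for application processors, which
 * simply reuse the page tables built by the BSP.
 */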
pmap_pte_t * __init
pmap_ap_setup_paging(void)
{
#ifdef X86_PAE
cpu_enable_pae();
#endif /* X86_PAE */
return pmap_boot_root_pt;
}
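/*
 * Set the global bit on the kernel mappings walked below, update the level 1
 * mask so that future kernel mappings are created global, and enable global
 * pages on the processor.
 */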
static void __init
pmap_setup_global_pages(void)
{
const struct pmap_pt_level *pt_level;
unsigned long va;
unsigned int level;
pmap_pte_t *pte;
va = VM_MAX_KERNEL_ADDRESS;
while (va >= VM_MAX_KERNEL_ADDRESS) {
for (level = PMAP_NR_LEVELS; level > 0; level--) {
pt_level = &pmap_pt_levels[level - 1];
pte = &pt_level->ptes[PMAP_PTEMAP_INDEX(va, pt_level->shift)];
if (!(*pte & PMAP_PTE_P)) {
pte = NULL;
va = P2END(va, 1UL << pt_level->shift);
break;
}
}
if (pte == NULL)
continue;
*pte |= PMAP_PTE_G;
va += PAGE_SIZE;
}
pmap_pt_levels[0].mask |= PMAP_PTE_G;
cpu_enable_global_pages();
}
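/*
 * Bootstrap the pmap module on the BSP: switch to the persistent copies of
 * the bootstrap tables, initialize the protection conversion table and the
 * boot heap, adjust the protection of the kernel sections, and enable
 * global pages if they are supported.
 */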
void __init
pmap_bootstrap(void)
{
memcpy(pmap_pt_levels, pmap_boot_pt_levels, sizeof(pmap_pt_levels));
#ifdef X86_PAE
memcpy(pmap_pdpt, pmap_boot_pdpt, sizeof(pmap_pdpt));
pmap_boot_root_pt = (void *)BOOT_VTOP((unsigned long)pmap_pdpt);
pmap_kroot_pt = (unsigned long)pmap_boot_root_pt;
cpu_set_cr3(pmap_kroot_pt);
#else /* X86_PAE */
pmap_kroot_pt = (unsigned long)pmap_boot_root_pt;
#endif /* X86_PAE */
pmap_prot_table[VM_PROT_NONE] = 0;
pmap_prot_table[VM_PROT_READ] = 0;
pmap_prot_table[VM_PROT_WRITE] = PMAP_PTE_RW;
pmap_prot_table[VM_PROT_WRITE | VM_PROT_READ] = PMAP_PTE_RW;
pmap_prot_table[VM_PROT_EXECUTE] = 0;
pmap_prot_table[VM_PROT_EXECUTE | VM_PROT_READ] = 0;
pmap_prot_table[VM_PROT_EXECUTE | VM_PROT_WRITE] = PMAP_PTE_RW;
pmap_prot_table[VM_PROT_ALL] = PMAP_PTE_RW;
pmap_boot_heap = (unsigned long)&_end;
pmap_boot_heap_end = pmap_boot_heap + (PMAP_RESERVED_PAGES * PAGE_SIZE);
pmap_zero_va = pmap_bootalloc(1);
pmap_kprotect((unsigned long)&_text, (unsigned long)&_rodata,
VM_PROT_READ | VM_PROT_EXECUTE);
pmap_kprotect((unsigned long)&_rodata, (unsigned long)&_data, VM_PROT_READ);
if (cpu_has_global_pages())
pmap_setup_global_pages();
cpu_tlb_flush();
pmap_kernel_limit = VM_MIN_KERNEL_ADDRESS;
}
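/*
 * Bootstrap the pmap module on an AP: enable global pages if supported and
 * spin until the BSP has switched the module to MP mode.
 */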
void __init
pmap_ap_bootstrap(void)
{
if (cpu_has_global_pages())
cpu_enable_global_pages();
while (!pmap_mp_mode)
cpu_pause();
}
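/*
 * Allocate pages of pure virtual memory from the reserved boot heap.
 */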
unsigned long __init
pmap_bootalloc(unsigned int nr_pages)
{
unsigned long page;
size_t size;
assert(nr_pages > 0);
size = nr_pages * PAGE_SIZE;
assert((pmap_boot_heap + size) > pmap_boot_heap);
assert((pmap_boot_heap + size) <= pmap_boot_heap_end);
page = pmap_boot_heap;
pmap_boot_heap += size;
return page;
}
unsigned long
pmap_klimit(void)
{
return pmap_kernel_limit;
}
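/*
 * Zero a physical page through a temporary mapping at pmap_zero_va.
 */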
static void
pmap_zero_page(phys_addr_t pa)
{
pmap_kenter(pmap_zero_va, pa);
cpu_tlb_flush_va(pmap_zero_va);
memset((void *)pmap_zero_va, 0, PAGE_SIZE);
pmap_kremove(pmap_zero_va, pmap_zero_va + PAGE_SIZE);
cpu_tlb_flush_va(pmap_zero_va);
}
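/*
 * Preallocate the page tables required to map kernel virtual memory up to
 * at least the given address, and raise pmap_kernel_limit accordingly.
 */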
void
pmap_growkernel(unsigned long va)
{
const struct pmap_pt_level *pt_level;
struct vm_page *page;
unsigned long start;
unsigned int level, i, i_start, i_va;
pmap_pte_t *pte;
phys_addr_t pa;
start = pmap_kernel_limit;
va = P2END(va, 1 << PMAP_L2_SHIFT) - 1;
assert(start < va);
for (level = PMAP_NR_LEVELS; level > 1; level--) {
pt_level = &pmap_pt_levels[level - 1];
i_start = PMAP_PTEMAP_INDEX(start, pt_level->shift);
i_va = PMAP_PTEMAP_INDEX(va, pt_level->shift);
for (i = i_start; i <= i_va; i++) {
pte = &pt_level->ptes[i];
if (!(*pte & PMAP_PTE_P)) {
if (!vm_phys_ready)
pa = vm_phys_bootalloc();
else {
page = vm_phys_alloc(0);
if (page == NULL)
panic("pmap: no page available to grow kernel space");
pa = vm_page_to_pa(page);
}
pmap_zero_page(pa);
*pte = (pa | PMAP_PTE_G | PMAP_PTE_RW | PMAP_PTE_P)
& pt_level->mask;
}
}
}
pmap_kernel_limit = va + 1;
}
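/*
 * Create a kernel mapping for a single physical page. The TLB isn't flushed
 * here; callers flush the affected range themselves, e.g. with
 * pmap_kupdate().
 */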
void
pmap_kenter(unsigned long va, phys_addr_t pa)
{
pmap_pte_t *pte;
pte = PMAP_PTEMAP_BASE + PMAP_PTEMAP_INDEX(va, PMAP_L1_SHIFT);
*pte = ((pa & PMAP_PA_MASK) | PMAP_PTE_G | PMAP_PTE_RW | PMAP_PTE_P)
& pmap_pt_levels[0].mask;
}
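/*
 * Remove the kernel mappings in the given range. As with pmap_kenter(),
 * the TLB isn't flushed here.
 */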
void
pmap_kremove(unsigned long start, unsigned long end)
{
pmap_pte_t *pte;
while (start < end) {
pte = PMAP_PTEMAP_BASE + PMAP_PTEMAP_INDEX(start, PMAP_L1_SHIFT);
*pte = 0;
start += PAGE_SIZE;
}
}
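/*
 * Change the protection of the kernel mappings in the given range. Only
 * the bits in PMAP_PTE_PROT_MASK are affected.
 */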
void
pmap_kprotect(unsigned long start, unsigned long end, int prot)
{
pmap_pte_t *pte, flags;
flags = pmap_prot_table[prot & VM_PROT_ALL];
while (start < end) {
pte = PMAP_PTEMAP_BASE + PMAP_PTEMAP_INDEX(start, PMAP_L1_SHIFT);
*pte = (*pte & ~PMAP_PTE_PROT_MASK) | flags;
start += PAGE_SIZE;
}
}
static void
pmap_kupdate_local(unsigned long start, unsigned long end)
{
while (start < end) {
cpu_tlb_flush_va(start);
start += PAGE_SIZE;
}
}
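/*
 * Flush the TLB entries for the given range on all processors. In MP mode,
 * remote processors are notified with a broadcast IPI (TRAP_PMAP_UPDATE)
 * and this function waits until they have all completed their local update.
 */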
void
pmap_kupdate(unsigned long start, unsigned long end)
{
unsigned int nr_cpus;
if (pmap_mp_mode)
nr_cpus = cpu_count();
else
nr_cpus = 1;
if (nr_cpus == 1) {
pmap_kupdate_local(start, end);
return;
}
pmap_update_start = start;
pmap_update_end = end;
pmap_nr_updates.count = nr_cpus - 1;
mb_store();
lapic_ipi_broadcast(TRAP_PMAP_UPDATE);
/*
* Perform the local update now so that some time is given to the other
* processors, which slightly reduces contention on the update counter.
*/
pmap_kupdate_local(start, end);
while (pmap_nr_updates.count != 0)
cpu_pause();
}
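/*
 * Return the physical address mapped at the given kernel virtual address,
 * or 0 if there is no mapping.
 */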
phys_addr_t
pmap_kextract(unsigned long va)
{
const struct pmap_pt_level *pt_level;
unsigned int level;
pmap_pte_t *pte;
for (level = PMAP_NR_LEVELS; level > 0; level--) {
pt_level = &pmap_pt_levels[level - 1];
pte = &pt_level->ptes[PMAP_PTEMAP_INDEX(va, pt_level->shift)];
if (!(*pte & PMAP_PTE_P))
return 0;
}
return *pte & PMAP_PA_MASK;
}
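/*
 * Switch the pmap module to MP mode, after which TLB updates are propagated
 * to all processors. Interrupts must be enabled when calling this function.
 */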
void
pmap_mp_setup(void)
{
assert(cpu_intr_enabled());
pmap_mp_mode = 1;
}
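/*
 * Interrupt handler for remote TLB update requests.
 */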
void
pmap_update_intr(struct trap_frame *frame)
{
(void)frame;
lapic_eoi();
/* Interrupts are serializing events, no memory barrier required */
pmap_kupdate_local(pmap_update_start, pmap_update_end);
atomic_add(&pmap_nr_updates.count, -1);
}