Diffstat (limited to 'phys.c')
-rw-r--r--  phys.c  754
1 file changed, 754 insertions, 0 deletions
diff --git a/phys.c b/phys.c
new file mode 100644
index 0000000..5f36e14
--- /dev/null
+++ b/phys.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2011 Richard Braun.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Page allocator.
+ *
+ * This implementation uses the binary buddy system to manage its heap.
+ * Descriptions of the buddy system can be found in the following works:
+ * - "UNIX Internals: The New Frontiers", by Uresh Vahalia.
+ * - "Dynamic Storage Allocation: A Survey and Critical Review",
+ * by Paul R. Wilson, Mark S. Johnstone, Michael Neely, and David Boles.
+ *
+ * In addition, this allocator uses per-cpu pools of pages for level 0
+ * (i.e. single page) allocations. These pools act as caches (but are named
+ * differently to avoid confusion with CPU caches) that reduce contention on
+ * multiprocessor systems. When a pool is empty and cannot provide a page,
+ * it is filled by transferring multiple pages from the backend buddy system.
+ * Symmetrically, when a pool is full and cannot accept a freed page, half of
+ * it is drained back to the buddy system.
+ */
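+
+/*
+ * Example usage (a minimal sketch; main() and the block size are
+ * illustrative, not part of this allocator):
+ *
+ *  int main(void)
+ *  {
+ *      phys_pfn_t pfn;
+ *
+ *      phys_init();
+ *      pfn = phys_alloc(4);    // a level 2 (four-page) block
+ *
+ *      if (pfn == 0)
+ *          return 1;           // out of memory
+ *
+ *      phys_free(pfn, 4);
+ *      phys_info();
+ *      return 0;
+ *  }
+ */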
+
+#include <sched.h>
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+
+#include "cpu.h"
+#include "list.h"
+#include "phys.h"
+#include "error.h"
+#include "macros.h"
+
+/*
+ * The system page size.
+ *
+ * This macro actually expands to a global variable that is set on
+ * initialization.
+ */
+#define PAGE_SIZE ((unsigned long)_pagesize)
+
+/*
+ * Maximum number of segments.
+ */
+#define PHYS_MAX_SEGMENTS 2
+
+/*
+ * Segment boundaries (in page frames).
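+ *
+ * With 4 KB pages, these limits correspond to the historical 16 MB ISA DMA
+ * boundary and a 768 MB "normal" zone.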
+ */
+#define PHYS_ISADMA_LIMIT 0x1000
+#define PHYS_NORMAL_LIMIT 0x30000
+
+/*
+ * Number of segment lists.
+ */
+#define PHYS_NR_SEG_LISTS 2
+
+/*
+ * Segment list priorities.
+ *
+ * Higher priorities have lower numerical values.
+ */
+#define PHYS_SEGLIST_NORMAL 1
+#define PHYS_SEGLIST_ISADMA 0
+
+/*
+ * Number of free block lists per segment.
+ */
+#define PHYS_NR_FREE_LISTS 11
+
+/*
+ * The size of a CPU pool is computed by dividing the size of its containing
+ * segment by this value.
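+ *
+ * For example, a segment of 0x30000 pages yields 192-page pools, which the
+ * limit below caps at 128 pages.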
+ */
+#define PHYS_CPU_POOL_RATIO 1024
+
+/*
+ * Maximum number of pages in a CPU pool.
+ */
+#define PHYS_CPU_POOL_MAX_SIZE 128
+
+/*
+ * The transfer size of a CPU pool is computed by dividing the pool size by
+ * this value.
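+ *
+ * For example, a 128-page pool transfers 64 pages per fill or drain.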
+ */
+#define PHYS_CPU_POOL_TRANSFER_RATIO 2
+
+/*
+ * Per-processor cache of pages.
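+ *
+ * The structure is aligned on the L1 cache line size to prevent false
+ * sharing between processors.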
+ */
+struct phys_cpu_pool {
+ pthread_mutex_t lock;
+ int size;
+ int transfer_size;
+ int nr_pages;
+ struct list pages;
+} __aligned(CPU_L1_SIZE);
+
+/*
+ * Special level value for allocated pages.
+ *
+ * When a page is free, its level is the index of the free list linking it;
+ * when it is allocated, its level is set to this sentinel value.
+ */
+#define PHYS_LEVEL_ALLOCATED PHYS_NR_FREE_LISTS
+
+/*
+ * Doubly-linked list of free blocks.
+ */
+struct phys_free_list {
+ unsigned long size;
+ struct list blocks;
+};
+
+/*
+ * Segment name buffer size.
+ */
+#define PHYS_NAME_SIZE 16
+
+/*
+ * Segment of contiguous memory.
+ */
+struct phys_seg {
+ struct phys_cpu_pool cpu_pools[NR_CPUS];
+
+ struct list node;
+ phys_pfn_t start;
+ phys_pfn_t end;
+ struct phys_page *pages;
+ struct phys_page *pages_end;
+ pthread_mutex_t lock;
+ struct phys_free_list free_lists[PHYS_NR_FREE_LISTS];
+ unsigned long nr_free_pages;
+ char name[PHYS_NAME_SIZE];
+};
+
+/*
+ * See PAGE_SIZE.
+ */
+static long _pagesize;
+
+/*
+ * Segment lists, ordered by priority (higher priority lists have lower
+ * numerical values).
+ */
+static struct list phys_seg_lists[PHYS_NR_SEG_LISTS];
+
+/*
+ * Segment table.
+ */
+static struct phys_seg phys_segs[PHYS_MAX_SEGMENTS];
+
+/*
+ * Number of loaded segments.
+ */
+static unsigned int phys_segs_size;
+
+static void phys_page_init(struct phys_page *page, struct phys_seg *seg,
+ phys_pfn_t pfn)
+{
+ page->seg = seg;
+ page->pfn = pfn;
+ page->level = PHYS_LEVEL_ALLOCATED;
+}
+
+static inline struct phys_page * phys_page_lookup(phys_pfn_t pfn)
+{
+ struct phys_seg *seg;
+ unsigned int i;
+
+ for (i = 0; i < phys_segs_size; i++) {
+ seg = &phys_segs[i];
+
+ if ((pfn >= seg->start) && (pfn < seg->end))
+ return &seg->pages[pfn - seg->start];
+ }
+
+ return NULL;
+}
+
+static void phys_free_list_init(struct phys_free_list *free_list)
+{
+ free_list->size = 0;
+ list_init(&free_list->blocks);
+}
+
+static inline void phys_free_list_insert(struct phys_free_list *free_list,
+ struct phys_page *page)
+{
+ assert(page->level == PHYS_LEVEL_ALLOCATED);
+
+ free_list->size++;
+ list_insert(&free_list->blocks, &page->node);
+}
+
+static inline void phys_free_list_remove(struct phys_free_list *free_list,
+ struct phys_page *page)
+{
+ assert(free_list->size != 0);
+ assert(!list_empty(&free_list->blocks));
+ assert(page->level < PHYS_NR_FREE_LISTS);
+
+ free_list->size--;
+ list_remove(&page->node);
+}
+
+static struct phys_page * phys_seg_alloc_from_buddy(struct phys_seg *seg,
+ unsigned int level)
+{
+ struct phys_free_list *free_list;
+ struct phys_page *page, *buddy;
+ unsigned int i;
+
+ assert(level < PHYS_NR_FREE_LISTS);
+
+ for (i = level; i < PHYS_NR_FREE_LISTS; i++) {
+ free_list = &seg->free_lists[i];
+
+ if (free_list->size != 0)
+ break;
+ }
+
+ if (i == PHYS_NR_FREE_LISTS)
+ return NULL;
+
+ page = list_first_entry(&free_list->blocks, struct phys_page, node);
+ phys_free_list_remove(free_list, page);
+ page->level = PHYS_LEVEL_ALLOCATED;
+
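+    /*
+     * If the block was taken from a higher level, split it: keep the
+     * first half and return the second half (the buddy) to the free list
+     * one level below, repeating until the requested level is reached.
+     */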
+ while (i > level) {
+ i--;
+ buddy = &page[1 << i];
+ phys_free_list_insert(&seg->free_lists[i], buddy);
+ buddy->level = i;
+ }
+
+ seg->nr_free_pages -= (1 << level);
+ return page;
+}
+
+static void phys_seg_free_to_buddy(struct phys_seg *seg,
+ struct phys_page *page,
+ unsigned int level)
+{
+ struct phys_page *buddy;
+ phys_pfn_t size, pfn, buddy_pfn;
+
+ assert(page >= seg->pages);
+ assert(page < seg->pages_end);
+ assert(page->level == PHYS_LEVEL_ALLOCATED);
+ assert(level < PHYS_NR_FREE_LISTS);
+
+ size = (1 << level);
+ pfn = page->pfn;
+
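+    /*
+     * Coalesce the block with its buddy as long as the buddy is free and
+     * of the same size. The buddy of a 2^level block differs from it only
+     * in bit level of its page frame number: at level 2, for instance,
+     * pfn 8 and pfn 12 are buddies.
+     */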
+ while (level < (PHYS_NR_FREE_LISTS - 1)) {
+ buddy_pfn = pfn ^ (1 << level);
+
+ if ((buddy_pfn < seg->start) || (buddy_pfn >= seg->end))
+ break;
+
+ buddy = &seg->pages[buddy_pfn - seg->start];
+
+ if (buddy->level != level)
+ break;
+
+ phys_free_list_remove(&seg->free_lists[level], buddy);
+ buddy->level = PHYS_LEVEL_ALLOCATED;
+ level++;
+ pfn &= -(1 << level);
+ page = &seg->pages[pfn - seg->start];
+ }
+
+ phys_free_list_insert(&seg->free_lists[level], page);
+ page->level = level;
+ seg->nr_free_pages += size;
+}
+
+static void phys_cpu_pool_init(struct phys_cpu_pool *cpu_pool,
+ phys_pfn_t size)
+{
+    pthread_mutex_init(&cpu_pool->lock, NULL);
+    cpu_pool->size = size;
+ cpu_pool->transfer_size = (size + PHYS_CPU_POOL_TRANSFER_RATIO - 1)
+ / PHYS_CPU_POOL_TRANSFER_RATIO;
+ cpu_pool->nr_pages = 0;
+ list_init(&cpu_pool->pages);
+}
+
+/*
+ * Return a CPU pool.
+ *
+ * This function will generally return the pool matching the CPU running the
+ * calling thread. Because of context switches and thread migration, the
+ * caller might be running on another processor after this function returns.
+ * Although not optimal, this should rarely happen, and it doesn't otherwise
+ * affect allocator operation, as every CPU pool is always valid and access
+ * to it is serialized by its lock.
+ */
+static inline struct phys_cpu_pool * phys_cpu_pool_get(struct phys_seg *seg)
+{
+ return &seg->cpu_pools[cpu_id()];
+}
+
+static inline struct phys_page *
+phys_cpu_pool_pop(struct phys_cpu_pool *cpu_pool)
+{
+ struct phys_page *page;
+
+ assert(cpu_pool->nr_pages != 0);
+ cpu_pool->nr_pages--;
+ page = list_first_entry(&cpu_pool->pages, struct phys_page, node);
+ list_remove(&page->node);
+ return page;
+}
+
+static inline void phys_cpu_pool_push(struct phys_cpu_pool *cpu_pool,
+ struct phys_page *page)
+{
+ assert(cpu_pool->nr_pages < cpu_pool->size);
+ cpu_pool->nr_pages++;
+ list_insert(&cpu_pool->pages, &page->node);
+}
+
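+/*
+ * Fill an empty CPU pool with up to transfer_size pages taken from the
+ * buddy system under a single acquisition of the segment lock, and return
+ * the number of pages actually transferred. The pool lock must be held by
+ * the caller.
+ */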
+static int phys_cpu_pool_fill(struct phys_cpu_pool *cpu_pool,
+ struct phys_seg *seg)
+{
+ struct phys_page *page;
+ int i;
+
+ assert(cpu_pool->nr_pages == 0);
+
+ pthread_mutex_lock(&seg->lock);
+
+ for (i = 0; i < cpu_pool->transfer_size; i++) {
+ page = phys_seg_alloc_from_buddy(seg, 0);
+
+ if (page == NULL)
+ break;
+
+ phys_cpu_pool_push(cpu_pool, page);
+ }
+
+ pthread_mutex_unlock(&seg->lock);
+
+ return i;
+}
+
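+/*
+ * Give transfer_size pages (half of the pool) back to the buddy system,
+ * likewise under a single acquisition of the segment lock. The pool lock
+ * must be held by the caller.
+ */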
+static void phys_cpu_pool_drain(struct phys_cpu_pool *cpu_pool,
+ struct phys_seg *seg)
+{
+ struct phys_page *page;
+ int i;
+
+ assert(cpu_pool->nr_pages == cpu_pool->size);
+
+ pthread_mutex_lock(&seg->lock);
+
+ for (i = cpu_pool->transfer_size; i > 0; i--) {
+ page = phys_cpu_pool_pop(cpu_pool);
+ phys_seg_free_to_buddy(seg, page, 0);
+ }
+
+ pthread_mutex_unlock(&seg->lock);
+}
+
+static inline phys_pfn_t phys_seg_start(struct phys_seg *seg)
+{
+ return seg->start;
+}
+
+static inline phys_pfn_t phys_seg_end(struct phys_seg *seg)
+{
+ return seg->end;
+}
+
+static inline phys_pfn_t phys_seg_size(struct phys_seg *seg)
+{
+ return phys_seg_end(seg) - phys_seg_start(seg);
+}
+
+static phys_pfn_t phys_seg_compute_pool_size(phys_pfn_t seg_size)
+{
+ phys_pfn_t size;
+
+ size = seg_size / PHYS_CPU_POOL_RATIO;
+
+ if (size == 0)
+ size = 1;
+ else if (size > PHYS_CPU_POOL_MAX_SIZE)
+ size = PHYS_CPU_POOL_MAX_SIZE;
+
+ return size;
+}
+
+static void phys_seg_init(struct phys_seg *seg, struct phys_page *pages)
+{
+ phys_pfn_t seg_size, pool_size;
+ unsigned int i;
+
+ seg_size = phys_seg_size(seg);
+ pool_size = phys_seg_compute_pool_size(seg_size);
+
+ for (i = 0; i < ARRAY_SIZE(seg->cpu_pools); i++)
+ phys_cpu_pool_init(&seg->cpu_pools[i], pool_size);
+
+ seg->pages = pages;
+ seg->pages_end = pages + seg_size;
+ pthread_mutex_init(&seg->lock, NULL);
+
+ for (i = 0; i < ARRAY_SIZE(seg->free_lists); i++)
+ phys_free_list_init(&seg->free_lists[i]);
+
+ seg->nr_free_pages = 0;
+
+ for (i = 0; i < seg_size; i++)
+ phys_page_init(&pages[i], seg, seg->start + i);
+}
+
+/*
+ * Return the level (i.e. the index in the free lists array) matching the
+ * given size.
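+ *
+ * The level is the ceiling of the base-2 logarithm of the size: sizes 1,
+ * 2, 3 and 8 map to levels 0, 1, 2 and 3 respectively.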
+ */
+static inline unsigned int phys_get_level(phys_pfn_t size)
+{
+ assert(size != 0);
+
+ size--;
+
+ if (size == 0)
+ return 0;
+ else
+ return (sizeof(size) * CHAR_BIT) - __builtin_clzl(size);
+}
+
+static struct phys_page * phys_seg_alloc(struct phys_seg *seg, phys_pfn_t size)
+{
+ struct phys_cpu_pool *cpu_pool;
+ struct phys_page *page;
+ unsigned int level;
+ int filled;
+
+ level = phys_get_level(size);
+
+ if (level == 0) {
+ cpu_pool = phys_cpu_pool_get(seg);
+
+ pthread_mutex_lock(&cpu_pool->lock);
+
+ if (cpu_pool->nr_pages == 0) {
+ filled = phys_cpu_pool_fill(cpu_pool, seg);
+
+ if (!filled) {
+ pthread_mutex_unlock(&cpu_pool->lock);
+ return NULL;
+ }
+ }
+
+ page = phys_cpu_pool_pop(cpu_pool);
+ pthread_mutex_unlock(&cpu_pool->lock);
+ } else {
+ pthread_mutex_lock(&seg->lock);
+ page = phys_seg_alloc_from_buddy(seg, level);
+ pthread_mutex_unlock(&seg->lock);
+ }
+
+ return page;
+}
+
+static void phys_seg_free(struct phys_seg *seg, struct phys_page *page,
+ phys_pfn_t size)
+{
+ struct phys_cpu_pool *cpu_pool;
+ unsigned int level;
+
+ level = phys_get_level(size);
+
+ if (level == 0) {
+ cpu_pool = phys_cpu_pool_get(seg);
+
+ pthread_mutex_lock(&cpu_pool->lock);
+
+ if (cpu_pool->nr_pages == cpu_pool->size)
+ phys_cpu_pool_drain(cpu_pool, seg);
+
+ phys_cpu_pool_push(cpu_pool, page);
+ pthread_mutex_unlock(&cpu_pool->lock);
+ } else {
+ pthread_mutex_lock(&seg->lock);
+ phys_seg_free_to_buddy(seg, page, level);
+ pthread_mutex_unlock(&seg->lock);
+ }
+}
+
+/*
+ * Load memory during initialization.
+ *
+ * This function partially initializes a segment.
+ */
+static void phys_load_segment(const char *name, phys_pfn_t start,
+ phys_pfn_t end, unsigned int seg_list_prio)
+{
+ static int initialized = 0;
+ struct phys_seg *seg;
+ struct list *seg_list;
+ unsigned int i;
+
+ assert(name != NULL);
+ assert(start < end);
+ assert(seg_list_prio < ARRAY_SIZE(phys_seg_lists));
+
+ if (!initialized) {
+ for (i = 0; i < ARRAY_SIZE(phys_seg_lists); i++)
+ list_init(&phys_seg_lists[i]);
+
+ phys_segs_size = 0;
+ initialized = 1;
+ }
+
+ if (phys_segs_size >= ARRAY_SIZE(phys_segs))
+ error_die(ERR_NORES);
+
+ seg_list = &phys_seg_lists[seg_list_prio];
+ seg = &phys_segs[phys_segs_size];
+
+ list_insert_tail(seg_list, &seg->node);
+ seg->start = start;
+ seg->end = end;
+ strncpy(seg->name, name, PHYS_NAME_SIZE);
+ seg->name[sizeof(seg->name) - 1] = '\0';
+
+ phys_segs_size++;
+}
+
+/*
+ * Loading segments is normally done by architecture-specific code. This
+ * implementation instead emulates an Intel machine with two segments of
+ * RAM, backed by anonymous mmap() mappings.
+ */
+static void phys_load_segments(void)
+{
+ phys_pfn_t start, end;
+ size_t size;
+ void *addr;
+
+ /*
+ * Load the ISADMA segment.
+ */
+ size = PHYS_ISADMA_LIMIT * PAGE_SIZE;
+ addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (addr == MAP_FAILED)
+ error_die(ERR_NOMEM);
+
+ start = (unsigned long)addr / PAGE_SIZE;
+ end = start + (size / PAGE_SIZE);
+ phys_load_segment("isadma", start, end, PHYS_SEGLIST_ISADMA);
+
+ /*
+ * Load the normal segment.
+ */
+ size = (PHYS_NORMAL_LIMIT - PHYS_ISADMA_LIMIT) * PAGE_SIZE;
+ addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (addr == MAP_FAILED)
+ error_die(ERR_NOMEM);
+
+ start = (unsigned long)addr / PAGE_SIZE;
+ end = start + (size / PAGE_SIZE);
+ phys_load_segment("normal", start, end, PHYS_SEGLIST_NORMAL);
+}
+
+void phys_init(void)
+{
+ struct phys_seg *seg, *map_seg;
+ struct phys_page *page, *map;
+ struct list *seg_list;
+ phys_pfn_t start, map_size;
+ unsigned int i;
+
+ _pagesize = sysconf(_SC_PAGESIZE);
+ assert(ISP2(_pagesize));
+
+ phys_load_segments();
+
+ /*
+     * Compute the size of the memory map (one page descriptor per managed
+     * page), in pages.
+ */
+ map_size = 0;
+
+ for (i = 0; i < phys_segs_size; i++)
+ map_size += phys_seg_size(&phys_segs[i]);
+
+ map_size = P2ROUND(map_size * sizeof(struct phys_page), PAGE_SIZE)
+ / PAGE_SIZE;
+
+ /*
+ * Find a segment from which to allocate the memory map.
+ */
+ for (seg_list = &phys_seg_lists[ARRAY_SIZE(phys_seg_lists) - 1];
+ seg_list >= phys_seg_lists;
+ seg_list--)
+ list_for_each_entry(seg_list, map_seg, node)
+ if (map_size <= phys_seg_size(map_seg))
+ goto found;
+
+ error_die(ERR_NOMEM);
+
+found:
+ /*
+ * Allocate the memory map.
+ */
+ map = (struct phys_page *)(phys_seg_start(map_seg) * PAGE_SIZE);
+
+ /*
+ * Initialize the segments, associating them to the memory map. When
+ * the segments are initialized, all their pages are set allocated,
+ * with a block size of one (level 0). They are then released, which
+ * populates the free lists.
+ */
+ start = 0;
+
+ for (i = 0; i < phys_segs_size; i++) {
+ seg = &phys_segs[i];
+ phys_seg_init(seg, map + start);
+
+ /*
+ * Don't release the memory map pages.
+ *
+ * XXX The memory map pages normally don't need descriptors, as they
+ * are never released. This implementation however can be used in
+ * cases where some memory is reserved at the start of all segments.
+ * In order not to require descriptors for the memory map, the segment
+ * where the map resides should be split. As it is quite cumbersome,
+ * no effort is made here to avoid wasting descriptors and the pages
+ * containing them.
+ */
+ if (seg == map_seg)
+ page = seg->pages + map_size;
+ else
+ page = seg->pages;
+
+ while (page < seg->pages_end) {
+ phys_seg_free_to_buddy(seg, page, 0);
+ page++;
+ }
+
+ start += phys_seg_size(seg);
+ }
+}
+
+struct phys_page * phys_alloc_pages(phys_pfn_t size)
+{
+ struct list *seg_list;
+ struct phys_seg *seg;
+ struct phys_page *page;
+
+ for (seg_list = &phys_seg_lists[ARRAY_SIZE(phys_seg_lists) - 1];
+ seg_list >= phys_seg_lists;
+ seg_list--)
+ list_for_each_entry(seg_list, seg, node) {
+ page = phys_seg_alloc(seg, size);
+
+ if (page != NULL)
+ return page;
+ }
+
+ return NULL;
+}
+
+void phys_free_pages(struct phys_page *page, phys_pfn_t size)
+{
+ phys_seg_free(page->seg, page, size);
+}
+
+phys_pfn_t phys_alloc(phys_pfn_t size)
+{
+ struct phys_page *page;
+
+ page = phys_alloc_pages(size);
+
+ /*
+ * XXX Rely on the system to never provide a virtual memory area
+ * starting at 0.
+ */
+ if (page == NULL)
+ return 0;
+
+ return page->pfn;
+}
+
+void phys_free(phys_pfn_t pfn, phys_pfn_t size)
+{
+ struct phys_page *page;
+
+ page = phys_page_lookup(pfn);
+ assert(page != NULL);
+ phys_free_pages(page, size);
+}
+
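+/*
+ * Display the size of every free list in every segment, one column per
+ * block size. The output has the following shape (the numbers here are
+ * illustrative):
+ *
+ *     name    #1    #2    #4    #8   ...
+ *   isadma     3     1     0     2   ...
+ *   normal    10     4     7     0   ...
+ */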
+void phys_info(void)
+{
+ struct phys_seg *seg;
+ unsigned int i, j;
+ char name[16];
+
+ printf(" name");
+
+ for (i = 0; i < PHYS_NR_FREE_LISTS; i++) {
+ snprintf(name, sizeof(name), "#%u", (1 << i));
+ printf(" %5s", name);
+ }
+
+ printf("\n");
+
+ for (i = 0; i < phys_segs_size; i++) {
+ seg = &phys_segs[i];
+
+ printf("%8s", seg->name);
+
+ pthread_mutex_lock(&seg->lock);
+
+ for (j = 0; j < ARRAY_SIZE(seg->free_lists); j++)
+ printf(" %5lu", seg->free_lists[j].size);
+
+ pthread_mutex_unlock(&seg->lock);
+
+ printf("\n");
+ }
+}