Diffstat (limited to 'phys.c')
-rw-r--r--  phys.c  754
1 file changed, 754 insertions, 0 deletions
diff --git a/phys.c b/phys.c
new file mode 100644
index 0000000..5f36e14
--- /dev/null
+++ b/phys.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2011 Richard Braun.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Page allocator.
+ *
+ * This implementation uses the binary buddy system to manage its heap.
+ * Descriptions of the buddy system can be found in the following works:
+ * - "UNIX Internals: The New Frontiers", by Uresh Vahalia.
+ * - "Dynamic Storage Allocation: A Survey and Critical Review",
+ * by Paul R. Wilson, Mark S. Johnstone, Michael Neely, and David Boles.
+ *
+ * In addition, this allocator uses per-cpu pools of pages for level 0
+ * (i.e. single page) allocations. These pools act as caches (but are named
+ * differently to avoid confusion with CPU caches) that reduce contention on
+ * multiprocessor systems. When a pool is empty and cannot provide a page,
+ * it is filled by transferring multiple pages from the backend buddy system.
+ * Symmetrically, when a pool is full and cannot accept a freed page, half of
+ * it is drained back to the buddy system.
+ */
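+
+/*
+ * Example usage (a minimal sketch; main() and the block size are
+ * illustrative, not part of this allocator):
+ *
+ *  int main(void)
+ *  {
+ *      phys_pfn_t pfn;
+ *
+ *      phys_init();
+ *      pfn = phys_alloc(4);    // a level 2 (four-page) block
+ *
+ *      if (pfn == 0)
+ *          return 1;           // out of memory
+ *
+ *      phys_free(pfn, 4);
+ *      phys_info();
+ *      return 0;
+ *  }
+ */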
+
+#include <sched.h>
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+
+#include "cpu.h"
+#include "list.h"
+#include "phys.h"
+#include "error.h"
+#include "macros.h"
+
+/*
+ * The system page size.
+ *
+ * This macro actually expands to a global variable that is set on
+ * initialization.
+ */
+#define PAGE_SIZE ((unsigned long)_pagesize)
+
+/*
+ * Maximum number of segments.
+ */
+#define PHYS_MAX_SEGMENTS 2
+
+/*
+ * Segment boundaries (in page frames).
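+ *
+ * With 4 KB pages, these limits correspond to the historical 16 MB ISA DMA
+ * boundary and a 768 MB "normal" zone.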
+ */
+#define PHYS_ISADMA_LIMIT 0x1000
+#define PHYS_NORMAL_LIMIT 0x30000
+
+/*
+ * Number of segment lists.
+ */
+#define PHYS_NR_SEG_LISTS 2
+
+/*
+ * Segment list priorities.
+ *
+ * Higher priorities have lower numerical values.
+ */
+#define PHYS_SEGLIST_NORMAL 1
+#define PHYS_SEGLIST_ISADMA 0
+
+/*
+ * Number of free block lists per segment.
+ */
+#define PHYS_NR_FREE_LISTS 11
+
+/*
+ * The size of a CPU pool is computed by dividing the size of its containing
+ * segment by this value.
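+ *
+ * For example, a segment of 0x30000 pages yields 192-page pools, which the
+ * limit below caps at 128 pages.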
+ */
+#define PHYS_CPU_POOL_RATIO 1024
+
+/*
+ * Maximum number of pages in a CPU pool.
+ */
+#define PHYS_CPU_POOL_MAX_SIZE 128
+
+/*
+ * The transfer size of a CPU pool is computed by dividing the pool size by
+ * this value.
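+ *
+ * For example, a 128-page pool transfers 64 pages per fill or drain.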
+ */
+#define PHYS_CPU_POOL_TRANSFER_RATIO 2
+
+/*
+ * Per-processor cache of pages.
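+ *
+ * The structure is aligned on the L1 cache line size to prevent false
+ * sharing between processors.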
+ */
+struct phys_cpu_pool {
+ pthread_mutex_t lock;
+ int size;
+ int transfer_size;
+ int nr_pages;
+ struct list pages;
+} __aligned(CPU_L1_SIZE);
+
+/*
+ * Special level value for allocated pages.
+ *
+ * When a page is free, its level is the index of the free list linking it;
+ * when it is allocated, its level is set to this sentinel value.
+ */
+#define PHYS_LEVEL_ALLOCATED PHYS_NR_FREE_LISTS
+
+/*
+ * Doubly-linked list of free blocks.
+ */
+struct phys_free_list {
+ unsigned long size;
+ struct list blocks;
+};
+
+/*
+ * Segment name buffer size.
+ */
+#define PHYS_NAME_SIZE 16
+
+/*
+ * Segment of contiguous memory.
+ */
+struct phys_seg {
+ struct phys_cpu_pool cpu_pools[NR_CPUS];
+
+ struct list node;
+ phys_pfn_t start;
+ phys_pfn_t end;
+ struct phys_page *pages;
+ struct phys_page *pages_end;
+ pthread_mutex_t lock;
+ struct phys_free_list free_lists[PHYS_NR_FREE_LISTS];
+ unsigned long nr_free_pages;
+ char name[PHYS_NAME_SIZE];
+};
+
+/*
+ * See PAGE_SIZE.
+ */
+static long _pagesize;
+
+/*
+ * Segment lists, ordered by priority (higher priority lists have lower
+ * numerical values).
+ */
+static struct list phys_seg_lists[PHYS_NR_SEG_LISTS];
+
+/*
+ * Segment table.
+ */
+static struct phys_seg phys_segs[PHYS_MAX_SEGMENTS];
+
+/*
+ * Number of loaded segments.
+ */
+static unsigned int phys_segs_size;
+
+static void phys_page_init(struct phys_page *page, struct phys_seg *seg,
+ phys_pfn_t pfn)
+{
+ page->seg = seg;
+ page->pfn = pfn;
+ page->level = PHYS_LEVEL_ALLOCATED;
+}
+
+static inline struct phys_page * phys_page_lookup(phys_pfn_t pfn)
+{
+ struct phys_seg *seg;
+ unsigned int i;
+
+ for (i = 0; i < phys_segs_size; i++) {
+ seg = &phys_segs[i];
+
+ if ((pfn >= seg->start) && (pfn < seg->end))
+ return &seg->pages[pfn - seg->start];
+ }
+
+ return NULL;
+}
+
+static void phys_free_list_init(struct phys_free_list *free_list)
+{
+ free_list->size = 0;
+ list_init(&free_list->blocks);
+}
+
+static inline void phys_free_list_insert(struct phys_free_list *free_list,
+ struct phys_page *page)
+{
+ assert(page->level == PHYS_LEVEL_ALLOCATED);
+
+ free_list->size++;
+ list_insert(&free_list->blocks, &page->node);
+}
+
+static inline void phys_free_list_remove(struct phys_free_list *free_list,
+ struct phys_page *page)
+{
+ assert(free_list->size != 0);
+ assert(!list_empty(&free_list->blocks));
+ assert(page->level < PHYS_NR_FREE_LISTS);
+
+ free_list->size--;
+ list_remove(&page->node);
+}
+
+static struct phys_page * phys_seg_alloc_from_buddy(struct phys_seg *seg,
+ unsigned int level)
+{
+ struct phys_free_list *free_list;
+ struct phys_page *page, *buddy;
+ unsigned int i;
+
+ assert(level < PHYS_NR_FREE_LISTS);
+
+ for (i = level; i < PHYS_NR_FREE_LISTS; i++) {
+ free_list = &seg->free_lists[i];
+
+ if (free_list->size != 0)
+ break;
+ }
+
+ if (i == PHYS_NR_FREE_LISTS)
+ return NULL;
+
+ page = list_first_entry(&free_list->blocks, struct phys_page, node);
+ phys_free_list_remove(free_list, page);
+ page->level = PHYS_LEVEL_ALLOCATED;
+
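+    /*
+     * If the block was taken from a higher level, split it: keep the
+     * first half and return the second half (the buddy) to the free list
+     * one level below, repeating until the requested level is reached.
+     */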
+ while (i > level) {
+ i--;
+ buddy = &page[1 << i];
+ phys_free_list_insert(&seg->free_lists[i], buddy);
+ buddy->level = i;
+ }
+
+ seg->nr_free_pages -= (1 << level);
+ return page;
+}
+
+static void phys_seg_free_to_buddy(struct phys_seg *seg,
+ struct phys_page *page,
+ unsigned int level)
+{
+ struct phys_page *buddy;
+ phys_pfn_t size, pfn, buddy_pfn;
+
+ assert(page >= seg->pages);
+ assert(page < seg->pages_end);
+ assert(page->level == PHYS_LEVEL_ALLOCATED);
+ assert(level < PHYS_NR_FREE_LISTS);
+
+ size = (1 << level);
+ pfn = page->pfn;
+
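+    /*
+     * Coalesce the block with its buddy as long as the buddy is free and
+     * of the same size. The buddy of a 2^level block differs from it only
+     * in bit level of its page frame number: at level 2, for instance,
+     * pfn 8 and pfn 12 are buddies.
+     */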
+ while (level < (PHYS_NR_FREE_LISTS - 1)) {
+ buddy_pfn = pfn ^ (1 << level);
+
+ if ((buddy_pfn < seg->start) || (buddy_pfn >= seg->end))
+ break;
+
+ buddy = &seg->pages[buddy_pfn - seg->start];
+
+ if (buddy->level != level)
+ break;
+
+ phys_free_list_remove(&seg->free_lists[level], buddy);
+ buddy->level = PHYS_LEVEL_ALLOCATED;
+ level++;
+ pfn &= -(1 << level);
+ page = &seg->pages[pfn - seg->start];
+ }
+
+ phys_free_list_insert(&seg->free_lists[level], page);
+ page->level = level;
+ seg->nr_free_pages += size;
+}
+
+static void phys_cpu_pool_init(struct phys_cpu_pool *cpu_pool,
+ phys_pfn_t size)
+{
+    pthread_mutex_init(&cpu_pool->lock, NULL);
+    cpu_pool->size = size;
+ cpu_pool->transfer_size = (size + PHYS_CPU_POOL_TRANSFER_RATIO - 1)
+ / PHYS_CPU_POOL_TRANSFER_RATIO;
+ cpu_pool->nr_pages = 0;
+ list_init(&cpu_pool->pages);
+}
+
+/*
+ * Return a CPU pool.
+ *
+ * This function will generally return the pool matching the CPU running the
+ * calling thread. Because of context switches and thread migration, the
+ * caller might be running on another processor after this function returns.
+ * Although not optimal, this should rarely happen, and it doesn't otherwise
+ * affect allocator operation, as every CPU pool is always valid and access
+ * to it is serialized by its lock.
+ */
+static inline struct phys_cpu_pool * phys_cpu_pool_get(struct phys_seg *seg)
+{
+ return &seg->cpu_pools[cpu_id()];
+}
+
+static inline struct phys_page *
+phys_cpu_pool_pop(struct phys_cpu_pool *cpu_pool)
+{
+ struct phys_page *page;
+
+ assert(cpu_pool->nr_pages != 0);
+ cpu_pool->nr_pages--;
+ page = list_first_entry(&cpu_pool->pages, struct phys_page, node);
+ list_remove(&page->node);
+ return page;
+}
+
+static inline void phys_cpu_pool_push(struct phys_cpu_pool *cpu_pool,
+ struct phys_page *page)
+{
+ assert(cpu_pool->nr_pages < cpu_pool->size);
+ cpu_pool->nr_pages++;
+ list_insert(&cpu_pool->pages, &page->node);
+}
+
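+/*
+ * Fill an empty CPU pool with up to transfer_size pages taken from the
+ * buddy system under a single acquisition of the segment lock, and return
+ * the number of pages actually transferred. The pool lock must be held by
+ * the caller.
+ */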
+static int phys_cpu_pool_fill(struct phys_cpu_pool *cpu_pool,
+ struct phys_seg *seg)
+{
+ struct phys_page *page;
+ int i;
+
+ assert(cpu_pool->nr_pages == 0);
+
+ pthread_mutex_lock(&seg->lock);
+
+ for (i = 0; i < cpu_pool->transfer_size; i++) {
+ page = phys_seg_alloc_from_buddy(seg, 0);
+
+ if (page == NULL)
+ break;
+
+ phys_cpu_pool_push(cpu_pool, page);
+ }
+
+ pthread_mutex_unlock(&seg->lock);
+
+ return i;
+}
+
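+/*
+ * Give transfer_size pages (half of the pool) back to the buddy system,
+ * likewise under a single acquisition of the segment lock. The pool lock
+ * must be held by the caller.
+ */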
+static void phys_cpu_pool_drain(struct phys_cpu_pool *cpu_pool,
+ struct phys_seg *seg)
+{
+ struct phys_page *page;
+ int i;
+
+ assert(cpu_pool->nr_pages == cpu_pool->size);
+
+ pthread_mutex_lock(&seg->lock);
+
+ for (i = cpu_pool->transfer_size; i > 0; i--) {
+ page = phys_cpu_pool_pop(cpu_pool);
+ phys_seg_free_to_buddy(seg, page, 0);
+ }
+
+ pthread_mutex_unlock(&seg->lock);
+}
+
+static inline phys_pfn_t phys_seg_start(struct phys_seg *seg)
+{
+ return seg->start;
+}
+
+static inline phys_pfn_t phys_seg_end(struct phys_seg *seg)
+{
+ return seg->end;
+}
+
+static inline phys_pfn_t phys_seg_size(struct phys_seg *seg)
+{
+ return phys_seg_end(seg) - phys_seg_start(seg);
+}
+
+static phys_pfn_t phys_seg_compute_pool_size(phys_pfn_t seg_size)
+{
+ phys_pfn_t size;
+
+ size = seg_size / PHYS_CPU_POOL_RATIO;
+
+ if (size == 0)
+ size = 1;
+ else if (size > PHYS_CPU_POOL_MAX_SIZE)
+ size = PHYS_CPU_POOL_MAX_SIZE;
+
+ return size;
+}
+
+static void phys_seg_init(struct phys_seg *seg, struct phys_page *pages)
+{
+ phys_pfn_t seg_size, pool_size;
+ unsigned int i;
+
+ seg_size = phys_seg_size(seg);
+ pool_size = phys_seg_compute_pool_size(seg_size);
+
+ for (i = 0; i < ARRAY_SIZE(seg->cpu_pools); i++)
+ phys_cpu_pool_init(&seg->cpu_pools[i], pool_size);
+
+ seg->pages = pages;
+ seg->pages_end = pages + seg_size;
+ pthread_mutex_init(&seg->lock, NULL);
+
+ for (i = 0; i < ARRAY_SIZE(seg->free_lists); i++)
+ phys_free_list_init(&seg->free_lists[i]);
+
+ seg->nr_free_pages = 0;
+
+ for (i = 0; i < seg_size; i++)
+ phys_page_init(&pages[i], seg, seg->start + i);
+}
+
+/*
+ * Return the level (i.e. the index in the free lists array) matching the
+ * given size.
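+ *
+ * The level is the ceiling of the base-2 logarithm of the size: sizes 1,
+ * 2, 3 and 8 map to levels 0, 1, 2 and 3 respectively.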
+ */
+static inline unsigned int phys_get_level(phys_pfn_t size)
+{
+ assert(size != 0);
+
+ size--;
+
+ if (size == 0)
+ return 0;
+ else
+ return (sizeof(size) * CHAR_BIT) - __builtin_clzl(size);
+}
+
+static struct phys_page * phys_seg_alloc(struct phys_seg *seg, phys_pfn_t size)
+{
+ struct phys_cpu_pool *cpu_pool;
+ struct phys_page *page;
+ unsigned int level;
+ int filled;
+
+ level = phys_get_level(size);
+
+ if (level == 0) {
+ cpu_pool = phys_cpu_pool_get(seg);
+
+ pthread_mutex_lock(&cpu_pool->lock);
+
+ if (cpu_pool->nr_pages == 0) {
+ filled = phys_cpu_pool_fill(cpu_pool, seg);
+
+ if (!filled) {
+ pthread_mutex_unlock(&cpu_pool->lock);
+ return NULL;
+ }
+ }
+
+ page = phys_cpu_pool_pop(cpu_pool);
+ pthread_mutex_unlock(&cpu_pool->lock);
+ } else {
+ pthread_mutex_lock(&seg->lock);
+ page = phys_seg_alloc_from_buddy(seg, level);
+ pthread_mutex_unlock(&seg->lock);
+ }
+
+ return page;
+}
+
+static void phys_seg_free(struct phys_seg *seg, struct phys_page *page,
+ phys_pfn_t size)
+{
+ struct phys_cpu_pool *cpu_pool;
+ unsigned int level;
+
+ level = phys_get_level(size);
+
+ if (level == 0) {
+ cpu_pool = phys_cpu_pool_get(seg);
+
+ pthread_mutex_lock(&cpu_pool->lock);
+
+ if (cpu_pool->nr_pages == cpu_pool->size)
+ phys_cpu_pool_drain(cpu_pool, seg);
+
+ phys_cpu_pool_push(cpu_pool, page);
+ pthread_mutex_unlock(&cpu_pool->lock);
+ } else {
+ pthread_mutex_lock(&seg->lock);
+ phys_seg_free_to_buddy(seg, page, level);
+ pthread_mutex_unlock(&seg->lock);
+ }
+}
+
+/*
+ * Load memory during initialization.
+ *
+ * This function partially initializes a segment.
+ */
+static void phys_load_segment(const char *name, phys_pfn_t start,
+ phys_pfn_t end, unsigned int seg_list_prio)
+{
+ static int initialized = 0;
+ struct phys_seg *seg;
+ struct list *seg_list;
+ unsigned int i;
+
+ assert(name != NULL);
+ assert(start < end);
+ assert(seg_list_prio < ARRAY_SIZE(phys_seg_lists));
+
+ if (!initialized) {
+ for (i = 0; i < ARRAY_SIZE(phys_seg_lists); i++)
+ list_init(&phys_seg_lists[i]);
+
+ phys_segs_size = 0;
+ initialized = 1;
+ }
+
+ if (phys_segs_size >= ARRAY_SIZE(phys_segs))
+ error_die(ERR_NORES);
+
+ seg_list = &phys_seg_lists[seg_list_prio];
+ seg = &phys_segs[phys_segs_size];
+
+ list_insert_tail(seg_list, &seg->node);
+ seg->start = start;
+ seg->end = end;
+ strncpy(seg->name, name, PHYS_NAME_SIZE);
+ seg->name[sizeof(seg->name) - 1] = '\0';
+
+ phys_segs_size++;
+}
+
+/*
+ * Loading segments is normally done by architecture-specific code. This
+ * implementation instead emulates an Intel machine with two segments of
+ * RAM, backed by anonymous mmap() mappings.
+ */
+static void phys_load_segments(void)
+{
+ phys_pfn_t start, end;
+ size_t size;
+ void *addr;
+
+ /*
+ * Load the ISADMA segment.
+ */
+ size = PHYS_ISADMA_LIMIT * PAGE_SIZE;
+ addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (addr == MAP_FAILED)
+ error_die(ERR_NOMEM);
+
+ start = (unsigned long)addr / PAGE_SIZE;
+ end = start + (size / PAGE_SIZE);
+ phys_load_segment("isadma", start, end, PHYS_SEGLIST_ISADMA);
+
+ /*
+ * Load the normal segment.
+ */
+ size = (PHYS_NORMAL_LIMIT - PHYS_ISADMA_LIMIT) * PAGE_SIZE;
+ addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (addr == MAP_FAILED)
+ error_die(ERR_NOMEM);
+
+ start = (unsigned long)addr / PAGE_SIZE;
+ end = start + (size / PAGE_SIZE);
+ phys_load_segment("normal", start, end, PHYS_SEGLIST_NORMAL);
+}
+
+void phys_init(void)
+{
+ struct phys_seg *seg, *map_seg;
+ struct phys_page *page, *map;
+ struct list *seg_list;
+ phys_pfn_t start, map_size;
+ unsigned int i;
+
+ _pagesize = sysconf(_SC_PAGESIZE);
+ assert(ISP2(_pagesize));
+
+ phys_load_segments();
+
+ /*
+     * Compute the size of the memory map (one page descriptor per managed
+     * page), in pages.
+ */
+ map_size = 0;
+
+ for (i = 0; i < phys_segs_size; i++)
+ map_size += phys_seg_size(&phys_segs[i]);
+
+ map_size = P2ROUND(map_size * sizeof(struct phys_page), PAGE_SIZE)
+ / PAGE_SIZE;
+
+ /*
+ * Find a segment from which to allocate the memory map.
+ */
+ for (seg_list = &phys_seg_lists[ARRAY_SIZE(phys_seg_lists) - 1];
+ seg_list >= phys_seg_lists;
+ seg_list--)
+ list_for_each_entry(seg_list, map_seg, node)
+ if (map_size <= phys_seg_size(map_seg))
+ goto found;
+
+ error_die(ERR_NOMEM);
+
+found:
+ /*
+ * Allocate the memory map.
+ */
+ map = (struct phys_page *)(phys_seg_start(map_seg) * PAGE_SIZE);
+
+ /*
+ * Initialize the segments, associating them to the memory map. When
+ * the segments are initialized, all their pages are set allocated,
+ * with a block size of one (level 0). They are then released, which
+ * populates the free lists.
+ */
+ start = 0;
+
+ for (i = 0; i < phys_segs_size; i++) {
+ seg = &phys_segs[i];
+ phys_seg_init(seg, map + start);
+
+ /*
+ * Don't release the memory map pages.
+ *
+ * XXX The memory map pages normally don't need descriptors, as they
+ * are never released. This implementation however can be used in
+ * cases where some memory is reserved at the start of all segments.
+ * In order not to require descriptors for the memory map, the segment
+ * where the map resides should be split. As it is quite cumbersome,
+ * no effort is made here to avoid wasting descriptors and the pages
+ * containing them.
+ */
+ if (seg == map_seg)
+ page = seg->pages + map_size;
+ else
+ page = seg->pages;
+
+ while (page < seg->pages_end) {
+ phys_seg_free_to_buddy(seg, page, 0);
+ page++;
+ }
+
+ start += phys_seg_size(seg);
+ }
+}
+
+struct phys_page * phys_alloc_pages(phys_pfn_t size)
+{
+ struct list *seg_list;
+ struct phys_seg *seg;
+ struct phys_page *page;
+
+ for (seg_list = &phys_seg_lists[ARRAY_SIZE(phys_seg_lists) - 1];
+ seg_list >= phys_seg_lists;
+ seg_list--)
+ list_for_each_entry(seg_list, seg, node) {
+ page = phys_seg_alloc(seg, size);
+
+ if (page != NULL)
+ return page;
+ }
+
+ return NULL;
+}
+
+void phys_free_pages(struct phys_page *page, phys_pfn_t size)
+{
+ phys_seg_free(page->seg, page, size);
+}
+
+phys_pfn_t phys_alloc(phys_pfn_t size)
+{
+ struct phys_page *page;
+
+ page = phys_alloc_pages(size);
+
+ /*
+ * XXX Rely on the system to never provide a virtual memory area
+ * starting at 0.
+ */
+ if (page == NULL)
+ return 0;
+
+ return page->pfn;
+}
+
+void phys_free(phys_pfn_t pfn, phys_pfn_t size)
+{
+ struct phys_page *page;
+
+ page = phys_page_lookup(pfn);
+ assert(page != NULL);
+ phys_free_pages(page, size);
+}
+
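+/*
+ * Display the size of every free list in every segment, one column per
+ * block size. The output has the following shape (the numbers here are
+ * illustrative):
+ *
+ *     name    #1    #2    #4    #8   ...
+ *   isadma     3     1     0     2   ...
+ *   normal    10     4     7     0   ...
+ */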
+void phys_info(void)
+{
+ struct phys_seg *seg;
+ unsigned int i, j;
+ char name[16];
+
+ printf(" name");
+
+ for (i = 0; i < PHYS_NR_FREE_LISTS; i++) {
+ snprintf(name, sizeof(name), "#%u", (1 << i));
+ printf(" %5s", name);
+ }
+
+ printf("\n");
+
+ for (i = 0; i < phys_segs_size; i++) {
+ seg = &phys_segs[i];
+
+ printf("%8s", seg->name);
+
+ pthread_mutex_lock(&seg->lock);
+
+ for (j = 0; j < ARRAY_SIZE(seg->free_lists); j++)
+ printf(" %5lu", seg->free_lists[j].size);
+
+ pthread_mutex_unlock(&seg->lock);
+
+ printf("\n");
+ }
+}