From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make VMAs per MM as for MMU-mode linux Make VMAs per mm_struct as for MMU-mode linux. This solves two problems: (1) In SYSV SHM where nattch for a segment does not reflect the number of shmat's (and forks) done. (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an exec'ing process when VM_EXECUTABLE is specified, regardless of the fact that a VMA might be shared and already have its vm_mm assigned to another process or a dead process. A new struct (vm_region) is introduced to track a mapped region and to remember the circumstances under which it may be shared and the vm_list_struct structure is discarded as it's no longer required. This patch makes the following additional changes: (1) Regions are now allocated with alloc_pages() rather than kmalloc() and with no recourse to __GFP_COMP, so the pages are not composite. Instead, each page has a reference on it held by the region. Anything else that is interested in such a page will have to get a reference on it to retain it. When the pages are released due to unmapping, each page is passed to put_page() and will be freed when the page usage count reaches zero. (2) Excess pages are trimmed after an allocation as the allocation must be made as a power-of-2 quantity of pages. (3) VMAs are added to the parent MM's R/B tree and mmap lists. As an MM may end up with overlapping VMAs within the tree, the VMA struct address is appended to the sort key. (4) Non-anonymous VMAs are now added to the backing inode's prio list. (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of the backing region. The VMA and region structs will be split if necessary. (6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory segment instead of all the attachments at that addresss. Multiple shmat()'s return the same address under NOMMU-mode instead of different virtual addresses as under MMU-mode. (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode. (8) /proc/maps is now the global list of mapped regions, and may list bits that aren't actually mapped anywhere. (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount of RAM currently allocated by mmap to hold mappable regions that can't be mapped directly. These are copies of the backing device or file if not anonymous. These changes make NOMMU mode more similar to MMU mode. The downside is that NOMMU mode requires some extra memory to track things over NOMMU without this patch (VMAs are no longer shared, and there are now region structs). Signed-off-by: David Howells Tested-by: Mike Frysinger Acked-by: Paul Mundt --- include/linux/mm.h | 18 ++++++------------ include/linux/mm_types.h | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4a3d28c8644..b91a73fd1bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -56,19 +56,9 @@ extern unsigned long mmap_min_addr; extern struct kmem_cache *vm_area_cachep; -/* - * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is - * disabled, then there's a single shared list of VMAs maintained by the - * system, and mm's subscribe to these individually - */ -struct vm_list_struct { - struct vm_list_struct *next; - struct vm_area_struct *vma; -}; - #ifndef CONFIG_MMU -extern struct rb_root nommu_vma_tree; -extern struct rw_semaphore nommu_vma_sem; +extern struct rb_root nommu_region_tree; +extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif @@ -1061,6 +1051,7 @@ extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_pages_min(void); extern void mem_init(void); +extern void __init mmap_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); @@ -1072,6 +1063,9 @@ extern void setup_per_cpu_pageset(void); static inline void setup_per_cpu_pageset(void) {} #endif +/* nommu.c */ +extern atomic_t mmap_pages_allocated; + /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9cfc9b627fd..1c1e0d3a171 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,22 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +/* + * A region containing a mapping of a non-memory backed file under NOMMU + * conditions. These are held in a global tree and are pinned by the VMAs that + * map parts of them. + */ +struct vm_region { + struct rb_node vm_rb; /* link in global region tree */ + unsigned long vm_flags; /* VMA vm_flags */ + unsigned long vm_start; /* start address of region */ + unsigned long vm_end; /* region initialised to here */ + unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ + struct file *vm_file; /* the backing file or NULL */ + + atomic_t vm_usage; /* region usage count */ +}; + /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -152,7 +168,7 @@ struct vm_area_struct { unsigned long vm_truncate_count;/* truncate_count or restart_addr */ #ifndef CONFIG_MMU - atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */ + struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ -- cgit v1.2.3 From dd8632a12e500a684478fea0951f380478d56fed Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make mmap allocation page trimming behaviour configurable. NOMMU mmap allocates a piece of memory for an mmap that's rounded up in size to the nearest power-of-2 number of pages. Currently it then discards the excess pages back to the page allocator, making that memory available for use by other things. This can, however, cause greater amount of fragmentation. To counter this, a sysctl is added in order to fine-tune the trimming behaviour. The default behaviour remains to trim pages aggressively, while this can either be disabled completely or set to a higher page-granular watermark in order to have finer-grained control. vm region vm_top bits taken from an earlier patch by David Howells. Signed-off-by: Paul Mundt Signed-off-by: David Howells Tested-by: Mike Frysinger --- Documentation/nommu-mmap.txt | 15 ++++++++++ Documentation/sysctl/vm.txt | 18 ++++++++++++ include/linux/mm_types.h | 1 + kernel/sysctl.c | 14 ++++++++++ mm/nommu.c | 65 ++++++++++++++++++++++++++++---------------- 5 files changed, 90 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 02b89dcf38a..b565e8279d1 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -248,3 +248,18 @@ PROVIDING SHAREABLE BLOCK DEVICE SUPPORT Provision of shared mappings on block device files is exactly the same as for character devices. If there isn't a real device underneath, then the driver should allocate sufficient contiguous memory to honour any supported mapping. + + +================================= +ADJUSTING PAGE TRIMMING BEHAVIOUR +================================= + +NOMMU mmap automatically rounds up to the nearest power-of-2 number of pages +when performing an allocation. This can have adverse effects on memory +fragmentation, and as such, is left configurable. The default behaviour is to +aggressively trim allocations and discard any excess pages back in to the page +allocator. In order to retain finer-grained control over fragmentation, this +behaviour can either be disabled completely, or bumped up to a higher page +watermark where trimming begins. + +Page trimming behaviour is configurable via the sysctl `vm.nr_trim_pages'. diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index cd05994a49e..a3415070bca 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - nr_hugepages - nr_overcommit_hugepages +- nr_trim_pages (only if CONFIG_MMU=n) ============================================================== @@ -348,3 +349,20 @@ Change the maximum size of the hugepage pool. The maximum is nr_hugepages + nr_overcommit_hugepages. See Documentation/vm/hugetlbpage.txt + +============================================================== + +nr_trim_pages + +This is available only on NOMMU kernels. + +This value adjusts the excess page trimming behaviour of power-of-2 aligned +NOMMU mmap allocations. + +A value of 0 disables trimming of allocations entirely, while a value of 1 +trims excess pages aggressively. Any value >= 1 acts as the watermark where +trimming of allocations is initiated. + +The default value is 1. + +See Documentation/nommu-mmap.txt for more information. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c1e0d3a171..92915e81443 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -106,6 +106,7 @@ struct vm_region { unsigned long vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ + unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 92f6e5bc3c2..89d74436318 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ @@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#else + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = VM_LAPTOP_MODE, diff --git a/mm/nommu.c b/mm/nommu.c index 0d363dfcf10..a6e8ccfbd40 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer - * Copyright (c) 2007 Paul Mundt + * Copyright (c) 2007-2008 Paul Mundt */ #include @@ -66,6 +66,7 @@ atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; atomic_t mmap_pages_allocated; @@ -455,6 +456,8 @@ static noinline void validate_nommu_regions(void) last = rb_entry(lastp, struct vm_region, vm_rb); if (unlikely(last->vm_end <= last->vm_start)) BUG(); + if (unlikely(last->vm_top < last->vm_end)) + BUG(); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); @@ -462,7 +465,9 @@ static noinline void validate_nommu_regions(void) if (unlikely(region->vm_end <= region->vm_start)) BUG(); - if (unlikely(region->vm_start < last->vm_end)) + if (unlikely(region->vm_top < region->vm_end)) + BUG(); + if (unlikely(region->vm_start < last->vm_top)) BUG(); lastp = p; @@ -536,7 +541,7 @@ static void free_page_series(unsigned long from, unsigned long to) /* * release a reference to a region * - the caller must hold the region semaphore, which this releases - * - the region may not have been added to the tree yet, in which case vm_end + * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ static void __put_nommu_region(struct vm_region *region) @@ -547,7 +552,7 @@ static void __put_nommu_region(struct vm_region *region) BUG_ON(!nommu_region_tree.rb_node); if (atomic_dec_and_test(®ion->vm_usage)) { - if (region->vm_end > region->vm_start) + if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -558,7 +563,7 @@ static void __put_nommu_region(struct vm_region *region) * from ramfs/tmpfs mustn't be released here */ if (region->vm_flags & VM_MAPPED_COPY) { kdebug("free series"); - free_page_series(region->vm_start, region->vm_end); + free_page_series(region->vm_start, region->vm_top); } kmem_cache_free(vm_region_jar, region); } else { @@ -999,6 +1004,10 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) int ret; ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret == 0) { + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; + } if (ret != -ENOSYS) return ret; @@ -1027,11 +1036,14 @@ static int do_mmap_private(struct vm_area_struct *vma, */ if (vma->vm_file) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); - if (ret != -ENOSYS) { + if (ret == 0) { /* shouldn't return success if we're not sharing */ - BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); - return ret; /* success or a real error */ + BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; } + if (ret != -ENOSYS) + return ret; /* getting an ENOSYS error indicates that direct mmap isn't * possible (as opposed to tried but failed) so we'll try to @@ -1051,23 +1063,25 @@ static int do_mmap_private(struct vm_area_struct *vma, if (!pages) goto enomem; - /* we allocated a power-of-2 sized page set, so we need to trim off the - * excess */ total = 1 << order; atomic_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; - while (total > point) { - order = ilog2(total - point); - n = 1 << order; - kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); - total -= n; - set_page_refcounted(pages + total); - __free_pages(pages + total, order); + + /* we allocated a power-of-2 sized page set, so we may want to trim off + * the excess */ + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } } - total = rlen >> PAGE_SHIFT; for (point = 1; point < total; point++) set_page_refcounted(&pages[point]); @@ -1075,6 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + rlen; + region->vm_top = region->vm_start + (total << PAGE_SHIFT); vma->vm_start = region->vm_start; vma->vm_end = region->vm_start + len; @@ -1110,6 +1125,7 @@ error_free: free_page_series(region->vm_start, region->vm_end); region->vm_start = vma->vm_start = 0; region->vm_end = vma->vm_end = 0; + region->vm_top = 0; return ret; enomem: @@ -1401,7 +1417,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, npages = (addr - vma->vm_start) >> PAGE_SHIFT; if (new_below) { - region->vm_end = new->vm_end = addr; + region->vm_top = region->vm_end = new->vm_end = addr; } else { region->vm_start = new->vm_start = addr; region->vm_pgoff = new->vm_pgoff += npages; @@ -1418,6 +1434,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; } else { vma->vm_region->vm_end = vma->vm_end = addr; + vma->vm_region->vm_top = addr; } add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); @@ -1454,10 +1471,12 @@ static int shrink_vma(struct mm_struct *mm, down_write(&nommu_region_sem); delete_nommu_region(region); - if (from > region->vm_start) - region->vm_end = from; - else + if (from > region->vm_start) { + to = region->vm_top; + region->vm_top = region->vm_end = from; + } else { region->vm_start = to; + } add_nommu_region(region); up_write(&nommu_region_sem); -- cgit v1.2.3