diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 780 |
1 files changed, 476 insertions, 304 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05fe1ddb033c..ef5070fed76b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> @@ -70,6 +71,7 @@ #include <linux/psi.h> #include <linux/padata.h> #include <linux/khugepaged.h> +#include <linux/buffer_head.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -78,6 +80,34 @@ #include "shuffle.h" #include "page_reporting.h" +/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ +typedef int __bitwise fpi_t; + +/* No special request */ +#define FPI_NONE ((__force fpi_t)0) + +/* + * Skip free page reporting notification for the (possibly merged) page. + * This does not hinder free page reporting from grabbing the page, + * reporting it and marking it "reported" - it only skips notifying + * the free page reporting infrastructure about a newly freed page. For + * example, used when temporarily pulling a page from a freelist and + * putting it back unmodified. + */ +#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) + +/* + * Place the (possibly merged) page to the tail of the freelist. Will ignore + * page shuffling (relevant code - e.g., memory onlining - is expected to + * shuffle the whole zone). + * + * Note: No code should rely on this flag for correctness - it's purely + * to allow for optimizations when handing back either fresh pages + * (memory onlining) or untouched pages (page isolation, free page + * reporting). + */ +#define FPI_TO_TAIL ((__force fpi_t)BIT(1)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -137,53 +167,26 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_alloc); -#else DEFINE_STATIC_KEY_FALSE(init_on_alloc); -#endif EXPORT_SYMBOL(init_on_alloc); -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_free); -#else DEFINE_STATIC_KEY_FALSE(init_on_free); -#endif EXPORT_SYMBOL(init_on_free); +static bool _init_on_alloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) { - int ret; - bool bool_result; - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); - if (bool_result) - static_branch_enable(&init_on_alloc); - else - static_branch_disable(&init_on_alloc); - return 0; + return kstrtobool(buf, &_init_on_alloc_enabled_early); } early_param("init_on_alloc", early_init_on_alloc); +static bool _init_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); static int __init early_init_on_free(char *buf) { - int ret; - bool bool_result; - - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); - if (bool_result) - static_branch_enable(&init_on_free); - else - static_branch_disable(&init_on_free); - return 0; + return kstrtobool(buf, &_init_on_free_enabled_early); } early_param("init_on_free", early_init_on_free); @@ -247,7 +250,8 @@ bool pm_suspended_storage(void) unsigned int pageblock_order __read_mostly; #endif -static void __free_pages_ok(struct page *page, unsigned int order); +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -419,6 +423,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) return false; + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) + return true; /* * We start only with one section of pages, more pages are added as * needed until the rest of deferred pages are initialized. @@ -466,14 +472,6 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, unsigned long pfn, @@ -492,6 +490,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page, return (word >> bitidx) & mask; } +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long mask) { @@ -659,7 +665,7 @@ out: void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page)); + __free_pages_ok(page, compound_order(page), FPI_NONE); } void prep_compound_page(struct page *page, unsigned int order) @@ -699,19 +705,6 @@ static int __init early_debug_pagealloc(char *buf) } early_param("debug_pagealloc", early_debug_pagealloc); -void init_debug_pagealloc(void) -{ - if (!debug_pagealloc_enabled()) - return; - - static_branch_enable(&_debug_pagealloc_enabled); - - if (!debug_guardpage_minorder()) - return; - - static_branch_enable(&_debug_guardpage_enabled); -} - static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; @@ -763,7 +756,54 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif -static inline void set_page_order(struct page *page, unsigned int order) +/* + * Enable static keys related to various memory debugging and hardening options. + * Some override others, and depend on early params that are evaluated in the + * order of appearance. So we need to first gather the full picture of what was + * enabled, and then make decisions. + */ +void init_mem_debugging_and_hardening(void) +{ + if (_init_on_alloc_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_alloc\n"); + else + static_branch_enable(&init_on_alloc); + } + if (_init_on_free_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_free\n"); + else + static_branch_enable(&init_on_free); + } + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. + */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) + static_branch_enable(&_page_poisoning_enabled); +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return; + + static_branch_enable(&_debug_pagealloc_enabled); + + if (!debug_guardpage_minorder()) + return; + + static_branch_enable(&_debug_guardpage_enabled); +#endif +} + +static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -788,7 +828,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy, if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; - if (page_order(buddy) != order) + if (buddy_order(buddy) != order) return false; /* @@ -873,13 +913,17 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, area->nr_free++; } -/* Used for pages which are on another list */ +/* + * Used for pages which are on another list. Move the pages to the tail + * of the list - so the moved pages won't immediately be considered for + * allocation again (e.g., optimization for memory onlining). + */ static inline void move_to_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype) { struct free_area *area = &zone->free_area[order]; - list_move(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->lru, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -952,7 +996,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype, bool report) + int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); unsigned long buddy_pfn; @@ -961,7 +1005,7 @@ static inline void __free_one_page(struct page *page, struct page *buddy; bool to_tail; - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -974,7 +1018,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: - while (order < max_order - 1) { + while (order < max_order) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -1000,7 +1044,7 @@ continue_merging: pfn = combined_pfn; order++; } - if (max_order < MAX_ORDER) { + if (order < MAX_ORDER - 1) { /* If we are here, it means order is >= pageblock_order. * We want to prevent merge between freepages on isolate * pageblock and normal pageblock. Without this, pageblock @@ -1021,14 +1065,16 @@ continue_merging: is_migrate_isolate(buddy_mt))) goto done_merging; } - max_order++; + max_order = order + 1; goto continue_merging; } done_merging: - set_page_order(page, order); + set_buddy_order(page, order); - if (is_shuffle_order(order)) + if (fpi_flags & FPI_TO_TAIL) + to_tail = true; + else if (is_shuffle_order(order)) to_tail = shuffle_pick_tail(); else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); @@ -1039,7 +1085,7 @@ done_merging: add_to_free_list(page, zone, order, migratetype); /* Notify page reporting subsystem of freed page */ - if (report) + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) page_reporting_notify_free(order); } @@ -1057,7 +1103,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page->mem_cgroup | + (unsigned long)page_memcg(page) | #endif (page->flags & check_flags))) return false; @@ -1082,7 +1128,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) + if (unlikely(page_memcg(page))) bad_reason = "page still charged to cgroup"; #endif return bad_reason; @@ -1160,8 +1206,12 @@ static void kernel_init_free_pages(struct page *page, int numpages) /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) + for (i = 0; i < numpages; i++) { + u8 tag = page_kasan_tag(page + i); + page_kasan_tag_reset(page + i); clear_highpage(page + i); + page_kasan_tag_set(page + i, tag); + } kasan_enable_current(); } @@ -1174,6 +1224,17 @@ static __always_inline bool free_pages_prepare(struct page *page, trace_mm_page_free(page, order); + if (unlikely(PageHWPoison(page)) && !order) { + /* + * Do not let hwpoison pages hit pcplists/buddy + * Untie memcg state and reset page's owner + */ + if (memcg_kmem_enabled() && PageMemcgKmem(page)) + __memcg_kmem_uncharge_page(page, order); + reset_page_owner(page, order); + return false; + } + /* * Check tail pages before head page information is cleared to * avoid checking PageCompound for order-0 pages. @@ -1198,7 +1259,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) bad += check_free_page(page); @@ -1218,7 +1279,8 @@ static __always_inline bool free_pages_prepare(struct page *page, if (want_init_on_free()) kernel_init_free_pages(page, 1 << order); - kernel_poison_pages(page, 1 << order, 0); + kernel_poison_pages(page, 1 << order); + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should @@ -1226,8 +1288,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ arch_free_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 0); + debug_pagealloc_unmap_pages(page, 1 << order); kasan_free_nondeferred_pages(page, order); @@ -1298,7 +1359,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int prefetch_nr = 0; + int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; LIST_HEAD(head); @@ -1349,8 +1410,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, * avoid excessive prefetching due to large count, only * prefetch buddy for the first pcp->batch nr of pages. */ - if (prefetch_nr++ < pcp->batch) + if (prefetch_nr) { prefetch_buddy(page); + prefetch_nr--; + } } while (--count && --batch_free && !list_empty(list)); } @@ -1369,7 +1432,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); @@ -1378,14 +1441,14 @@ static void free_pcppages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, - int migratetype) + int migratetype, fpi_t fpi_flags) { spin_lock(&zone->lock); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype, true); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock(&zone->lock); } @@ -1463,7 +1526,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) } } -static void __free_pages_ok(struct page *page, unsigned int order) +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; int migratetype; @@ -1475,7 +1539,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype, + fpi_flags); local_irq_restore(flags); } @@ -1485,6 +1550,11 @@ void __free_pages_core(struct page *page, unsigned int order) struct page *p = page; unsigned int loop; + /* + * When initializing the memmap, __init_single_page() sets the refcount + * of all pages to 1 ("allocated"/"not free"). We have to set the + * refcount of all involved pages to 0. + */ prefetchw(p); for (loop = 0; loop < (nr_pages - 1); loop++, p++) { prefetchw(p + 1); @@ -1495,20 +1565,33 @@ void __free_pages_core(struct page *page, unsigned int order) set_page_count(p, 0); atomic_long_add(nr_pages, &page_zone(page)->managed_pages); - set_page_refcounted(page); - __free_pages(page, order); + + /* + * Bypass PCP and place fresh pages right to the tail, primarily + * relevant for memory onlining. + */ + __free_pages_ok(page, order, FPI_TO_TAIL); } #ifdef CONFIG_NEED_MULTIPLE_NODES -static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * treats start/end as pfns. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn, +static int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; @@ -1526,7 +1609,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, return nid; } -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { @@ -2046,6 +2128,8 @@ void __init page_alloc_init_late(void) files_maxfiles_init(); #endif + buffer_init(); + /* Discard memblock private memory */ memblock_discard(); @@ -2121,7 +2205,7 @@ static inline void expand(struct zone *zone, struct page *page, continue; add_to_free_list(&page[size], zone, high, migratetype); - set_page_order(&page[size], high); + set_buddy_order(&page[size], high); } } @@ -2150,12 +2234,6 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(void) -{ - return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled()) || want_init_on_free(); -} - #ifdef CONFIG_DEBUG_VM /* * With DEBUG_VM enabled, order-0 pages are checked for expected state when @@ -2213,11 +2291,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); kasan_alloc_pages(page, order); - kernel_poison_pages(page, 1 << order, 1); + kernel_unpoison_pages(page, 1 << order); set_page_owner(page, order, gfp_flags); + + if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2225,9 +2305,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags { post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) - kernel_init_free_pages(page, 1 << order); - if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -2299,7 +2376,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Move the free pages in a range to the free lists of the requested type. + * Move the free pages in a range to the freelist tail of the requested type. * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ @@ -2335,7 +2412,7 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); VM_BUG_ON_PAGE(page_zone(page) != zone, page); - order = page_order(page); + order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); page += 1 << order; pages_moved += 1 << order; @@ -2413,12 +2490,12 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } -static inline void boost_watermark(struct zone *zone) +static inline bool boost_watermark(struct zone *zone) { unsigned long max_boost; if (!watermark_boost_factor) - return; + return false; /* * Don't bother in zones that are unlikely to produce results. * On small machines, including kdump capture kernels running @@ -2426,7 +2503,7 @@ static inline void boost_watermark(struct zone *zone) * memory situation immediately. */ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) - return; + return false; max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); @@ -2440,12 +2517,14 @@ static inline void boost_watermark(struct zone *zone) * boosted watermark resulting in a hang. */ if (!max_boost) - return; + return false; max_boost = max(pageblock_nr_pages, max_boost); zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, max_boost); + + return true; } /* @@ -2459,7 +2538,7 @@ static inline void boost_watermark(struct zone *zone) static void steal_suitable_fallback(struct zone *zone, struct page *page, unsigned int alloc_flags, int start_type, bool whole_block) { - unsigned int current_order = page_order(page); + unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; int old_block_type; @@ -2483,8 +2562,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * likelihood of future fallbacks. Wake kswapd now as the node * may be balanced overall and kswapd will not wake naturally. */ - boost_watermark(zone); - if (alloc_flags & ALLOC_KSWAPD) + if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); /* We are not allowed to try stealing from the whole block */ @@ -2786,20 +2864,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; -#ifdef CONFIG_CMA - /* - * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. - */ - if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { - page = __rmqueue_cma_fallback(zone, order); - if (page) - return page; + if (IS_ENABLED(CONFIG_CMA)) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (alloc_flags & ALLOC_CMA && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + goto out; + } } -#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -2810,8 +2888,9 @@ retry: alloc_flags)) goto retry; } - - trace_mm_page_alloc_zone_locked(page, order, migratetype); +out: + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@ -2960,13 +3039,16 @@ static void drain_local_pages_wq(struct work_struct *work) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator. - * - * When zone parameter is non-NULL, spill just the single zone's pages. + * The implementation of drain_all_pages(), exposing an extra parameter to + * drain on all cpus. * - * Note that this can be extremely slow as the draining happens in a workqueue. + * drain_all_pages() is optimized to only execute on cpus where pcplists are + * not empty. The check for non-emptiness can however race with a free to + * pcplist that has not yet increased the pcp->count from 0 to 1. Callers + * that need the guarantee that every CPU has drained can disable the + * optimizing racy check. */ -void drain_all_pages(struct zone *zone) +static void __drain_all_pages(struct zone *zone, bool force_all_cpus) { int cpu; @@ -3005,7 +3087,13 @@ void drain_all_pages(struct zone *zone) struct zone *z; bool has_pcps = false; - if (zone) { + if (force_all_cpus) { + /* + * The pcp.count check is racy, some callers need a + * guarantee that no cpu is missed. + */ + has_pcps = true; + } else if (zone) { pcp = per_cpu_ptr(zone->pageset, cpu); if (pcp->pcp.count) has_pcps = true; @@ -3038,6 +3126,18 @@ void drain_all_pages(struct zone *zone) mutex_unlock(&pcpu_drain_mutex); } +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * When zone parameter is non-NULL, spill just the single zone's pages. + * + * Note that this can be extremely slow as the draining happens in a workqueue. + */ +void drain_all_pages(struct zone *zone) +{ + __drain_all_pages(zone, false); +} + #ifdef CONFIG_HIBERNATION /* @@ -3123,7 +3223,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype, + FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; @@ -3132,10 +3233,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) pcp = &this_cpu_ptr(zone->pageset)->pcp; list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - if (pcp->count >= pcp->high) { - unsigned long batch = READ_ONCE(pcp->batch); - free_pcppages_bulk(zone, batch, pcp); - } + if (pcp->count >= READ_ONCE(pcp->high)) + free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); } /* @@ -3209,7 +3308,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order); + split_page_owner(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); @@ -3278,7 +3377,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) lockdep_assert_held(&zone->lock); /* Return isolated page to tail of freelist. */ - __free_one_page(page, page_to_pfn(page), zone, order, mt, false); + __free_one_page(page, page_to_pfn(page), zone, order, mt, + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); } /* @@ -3319,7 +3419,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, do { if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, + READ_ONCE(pcp->batch), list, migratetype, alloc_flags); if (unlikely(list_empty(list))) return NULL; @@ -3496,7 +3596,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ -static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); } @@ -4205,10 +4305,8 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); -static bool __need_fs_reclaim(gfp_t gfp_mask) +static bool __need_reclaim(gfp_t gfp_mask) { - gfp_mask = current_gfp_context(gfp_mask); - /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return false; @@ -4217,10 +4315,6 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) if (current->flags & PF_MEMALLOC) return false; - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) - return false; - if (gfp_mask & __GFP_NOLOCKDEP) return false; @@ -4239,15 +4333,29 @@ void __fs_reclaim_release(void) void fs_reclaim_acquire(gfp_t gfp_mask) { - if (__need_fs_reclaim(gfp_mask)) - __fs_reclaim_acquire(); + gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_acquire(); + +#ifdef CONFIG_MMU_NOTIFIER + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); +#endif + + } } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { - if (__need_fs_reclaim(gfp_mask)) - __fs_reclaim_release(); + gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_release(); + } } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -4945,9 +5053,29 @@ static inline void free_the_page(struct page *page, unsigned int order) if (order == 0) /* Via pcp? */ free_unref_page(page); else - __free_pages_ok(page, order); + __free_pages_ok(page, order, FPI_NONE); } +/** + * __free_pages - Free pages allocated with alloc_pages(). + * @page: The page pointer returned from alloc_pages(). + * @order: The order of the allocation. + * + * This function can free multi-page allocations that are not compound + * pages. It does not check that the @order passed in matches that of + * the allocation, so it is easy to leak memory. Freeing more memory + * than was allocated will probably emit a warning. + * + * If the last reference to this page is speculative, it will be released + * by put_page() which only frees the first page of a non-compound + * allocation. To prevent the remaining pages from being leaked, we free + * the subsequent pages here. If you want to use the page's reference + * count to decide when to free the allocation, you should allocate a + * compound page, and use put_page() instead of __free_pages(). + * + * Context: May be called in interrupt context or while holding a normal + * spinlock, but not in NMI context or while holding a raw spinlock. + */ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) @@ -5009,8 +5137,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) } EXPORT_SYMBOL(__page_frag_cache_drain); -void *page_frag_alloc(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -5044,6 +5173,11 @@ refill: if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; + if (unlikely(nc->pfmemalloc)) { + free_the_page(page, compound_order(page)); + goto refill; + } + #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) /* if size can vary use size else just use PAGE_SIZE */ size = nc->size; @@ -5057,11 +5191,12 @@ refill: } nc->pagecnt_bias--; + offset &= align_mask; nc->offset = offset; return nc->va + offset; } -EXPORT_SYMBOL(page_frag_alloc); +EXPORT_SYMBOL(page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. @@ -5401,7 +5536,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_zone_page_state(NR_PAGETABLE), + global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -5433,6 +5568,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" #endif + " pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5458,6 +5594,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif + K(node_page_state(pgdat, NR_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5489,7 +5626,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" @@ -5510,7 +5646,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), @@ -5840,7 +5975,10 @@ static void build_zonelists(pg_data_t *pgdat) * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static void pageset_init(struct per_cpu_pageset *p); +/* These effectively disable the pcplists in the boot pageset completely */ +#define BOOT_PAGESET_HIGH 0 +#define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); @@ -5908,7 +6046,7 @@ build_all_zonelists_init(void) * (a chicken-egg dilemma). */ for_each_possible_cpu(cpu) - setup_pageset(&per_cpu(boot_pageset, cpu), 0); + pageset_init(&per_cpu(boot_pageset, cpu)); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); @@ -5979,10 +6117,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum meminit_context context, - struct vmem_altmap *altmap) + unsigned long start_pfn, unsigned long zone_end_pfn, + enum meminit_context context, + struct vmem_altmap *altmap, int migratetype) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -6016,7 +6159,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) break; } @@ -6026,19 +6169,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, __SetPageReserved(page); /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, + * such that unmovable allocations won't be scattered all + * over the place during system boot. */ - if (!(pfn & (pageblock_nr_pages - 1))) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, migratetype); cond_resched(); } pfn++; @@ -6100,15 +6236,10 @@ void __ref memmap_init_zone_device(struct zone *zone, * the address space during boot when many long-lived * kernel allocations are made. * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. - * * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (!(pfn & (pageblock_nr_pages - 1))) { + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } @@ -6142,8 +6273,8 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; - memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn, + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } } @@ -6198,13 +6329,16 @@ static int zone_batchsize(struct zone *zone) } /* - * pcp->high and pcp->batch values are related and dependent on one another: - * ->batch must never be higher then ->high. - * The following function updates them in a safe manner without read side - * locking. + * pcp->high and pcp->batch values are related and generally batch is lower + * than high. They are also related to pcp->count such that count is lower + * than high, and as soon as it reaches high, the pcplist is flushed. * - * Any new users of pcp->batch and pcp->high should ensure they can cope with - * those fields changing asynchronously (acording to the above rule). + * However, guaranteeing these relations at all times would require e.g. write + * barriers here but also careful usage of read barriers at the read side, and + * thus be prone to error and bad for performance. Thus the update only prevents + * store tearing. Any new users of pcp->batch and pcp->high should ensure they + * can cope with those fields changing asynchronously, and fully trust only the + * pcp->count field on the local CPU with interrupts disabled. * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters @@ -6213,21 +6347,8 @@ static int zone_batchsize(struct zone *zone) static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, unsigned long batch) { - /* start with a fail safe value for batch */ - pcp->batch = 1; - smp_wmb(); - - /* Update high, then batch, in order */ - pcp->high = high; - smp_wmb(); - - pcp->batch = batch; -} - -/* a companion to pageset_set_high() */ -static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) -{ - pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); + WRITE_ONCE(pcp->batch, batch); + WRITE_ONCE(pcp->high, high); } static void pageset_init(struct per_cpu_pageset *p) @@ -6240,53 +6361,70 @@ static void pageset_init(struct per_cpu_pageset *p) pcp = &p->pcp; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); + + /* + * Set batch and high values safe for a boot pageset. A true percpu + * pageset's initialization will update them subsequently. Here we don't + * need to be as careful as pageset_update() as nobody can access the + * pageset yet. + */ + pcp->high = BOOT_PAGESET_HIGH; + pcp->batch = BOOT_PAGESET_BATCH; } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, + unsigned long batch) { - pageset_init(p); - pageset_set_batch(p, batch); + struct per_cpu_pageset *p; + int cpu; + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(zone->pageset, cpu); + pageset_update(&p->pcp, high, batch); + } } /* - * pageset_set_high() sets the high water mark for hot per_cpu_pagelist - * to the value high for the pageset p. + * Calculate and set new high and batch values for all per-cpu pagesets of a + * zone, based on the zone's size and the percpu_pagelist_fraction sysctl. */ -static void pageset_set_high(struct per_cpu_pageset *p, - unsigned long high) +static void zone_set_pageset_high_and_batch(struct zone *zone) { - unsigned long batch = max(1UL, high / 4); - if ((high / 4) > (PAGE_SHIFT * 8)) - batch = PAGE_SHIFT * 8; + unsigned long new_high, new_batch; - pageset_update(&p->pcp, high, batch); -} + if (percpu_pagelist_fraction) { + new_high = zone_managed_pages(zone) / percpu_pagelist_fraction; + new_batch = max(1UL, new_high / 4); + if ((new_high / 4) > (PAGE_SHIFT * 8)) + new_batch = PAGE_SHIFT * 8; + } else { + new_batch = zone_batchsize(zone); + new_high = 6 * new_batch; + new_batch = max(1UL, 1 * new_batch); + } -static void pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) -{ - if (percpu_pagelist_fraction) - pageset_set_high(pcp, - (zone_managed_pages(zone) / - percpu_pagelist_fraction)); - else - pageset_set_batch(pcp, zone_batchsize(zone)); -} + if (zone->pageset_high == new_high && + zone->pageset_batch == new_batch) + return; -static void __meminit zone_pageset_init(struct zone *zone, int cpu) -{ - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + zone->pageset_high = new_high; + zone->pageset_batch = new_batch; - pageset_init(pcp); - pageset_set_high_and_batch(zone, pcp); + __zone_set_pageset_high_and_batch(zone, new_high, new_batch); } void __meminit setup_zone_pageset(struct zone *zone) { + struct per_cpu_pageset *p; int cpu; + zone->pageset = alloc_percpu(struct per_cpu_pageset); - for_each_possible_cpu(cpu) - zone_pageset_init(zone, cpu); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(zone->pageset, cpu); + pageset_init(p); + } + + zone_set_pageset_high_and_batch(zone); } /* @@ -6329,6 +6467,8 @@ static __meminit void zone_pcp_init(struct zone *zone) * offset of a (static) per cpu variable into the per cpu area. */ zone->pageset = &boot_pageset; + zone->pageset_high = BOOT_PAGESET_HIGH; + zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", @@ -6739,7 +6879,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); - spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); } @@ -7541,6 +7680,11 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char * alias for the memset(). */ direct_map_addr = page_address(page); + /* + * Perform a kasan-unchecked memset() since this memory + * has not been initialized. + */ + direct_map_addr = kasan_reset_tag(direct_map_addr); if ((unsigned int)poison <= 0xFF) memset(direct_map_addr, poison, PAGE_SIZE); @@ -7734,31 +7878,24 @@ static void calculate_totalreserve_pages(void) static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; - enum zone_type j, idx; + enum zone_type i, j; for_each_online_pgdat(pgdat) { - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone_managed_pages(zone); - - zone->lowmem_reserve[j] = 0; - - idx = j; - while (idx) { - struct zone *lower_zone; - - idx--; - lower_zone = pgdat->node_zones + idx; - - if (!sysctl_lowmem_reserve_ratio[idx] || - !zone_managed_pages(lower_zone)) { - lower_zone->lowmem_reserve[j] = 0; - continue; + for (i = 0; i < MAX_NR_ZONES - 1; i++) { + struct zone *zone = &pgdat->node_zones[i]; + int ratio = sysctl_lowmem_reserve_ratio[i]; + bool clear = !ratio || !zone_managed_pages(zone); + unsigned long managed_pages = 0; + + for (j = i + 1; j < MAX_NR_ZONES; j++) { + if (clear) { + zone->lowmem_reserve[j] = 0; } else { - lower_zone->lowmem_reserve[j] = - managed_pages / sysctl_lowmem_reserve_ratio[idx]; + struct zone *upper_zone = &pgdat->node_zones[j]; + + managed_pages += zone_managed_pages(upper_zone); + zone->lowmem_reserve[j] = managed_pages / ratio; } - managed_pages += zone_managed_pages(lower_zone); } } } @@ -8020,15 +8157,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } -static void __zone_pcp_update(struct zone *zone) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); -} - /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each * cpu. It is the fraction of total pages in each zone that a hot per cpu @@ -8061,7 +8189,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, goto out; for_each_populated_zone(zone) - __zone_pcp_update(zone); + zone_set_pageset_high_and_batch(zone); out: mutex_unlock(&pcp_batch_high_lock); return ret; @@ -8292,7 +8420,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, */ if (!page_ref_count(page)) { if (PageBuddy(page)) - iter += (1 << page_order(page)) - 1; + iter += (1 << buddy_order(page)) - 1; continue; } @@ -8457,9 +8585,11 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype, 0); - if (ret < 0) + if (ret) return ret; + drain_all_pages(cc.zone); + /* * In case of -EBUSY, we'd like to know which page causes problem. * So, just fall through. test_pages_isolated() has a tracepoint @@ -8505,7 +8635,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, } if (outer_start != start) { - order = page_order(pfn_to_page(outer_start)); + order = buddy_order(pfn_to_page(outer_start)); /* * outer_start page could be small order buddy page and @@ -8668,7 +8798,28 @@ EXPORT_SYMBOL(free_contig_range); void __meminit zone_pcp_update(struct zone *zone) { mutex_lock(&pcp_batch_high_lock); - __zone_pcp_update(zone); + zone_set_pageset_high_and_batch(zone); + mutex_unlock(&pcp_batch_high_lock); +} + +/* + * Effectively disable pcplists for the zone by setting the high limit to 0 + * and draining all cpus. A concurrent page freeing on another CPU that's about + * to put the page on pcplist will either finish before the drain and the page + * will be drained, or observe the new high limit and skip the pcplist. + * + * Must be paired with a call to zone_pcp_enable(). + */ +void zone_pcp_disable(struct zone *zone) +{ + mutex_lock(&pcp_batch_high_lock); + __zone_set_pageset_high_and_batch(zone, 0, 1); + __drain_all_pages(zone, true); +} + +void zone_pcp_enable(struct zone *zone) +{ + __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch); mutex_unlock(&pcp_batch_high_lock); } @@ -8693,35 +8844,21 @@ void zone_pcp_reset(struct zone *zone) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * All pages in the range must be in a single zone and isolated - * before calling this. + * All pages in the range must be in a single zone, must not contain holes, + * must span full sections, and must be isolated before calling this function. */ -unsigned long -__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { + unsigned long pfn = start_pfn; struct page *page; struct zone *zone; unsigned int order; - unsigned long pfn; unsigned long flags; - unsigned long offlined_pages = 0; - - /* find the first valid pfn */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) - if (pfn_valid(pfn)) - break; - if (pfn == end_pfn) - return offlined_pages; offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); - pfn = start_pfn; while (pfn < end_pfn) { - if (!pfn_valid(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); /* * The HWPoisoned page may be not in buddy system, and @@ -8729,7 +8866,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) */ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; - offlined_pages++; continue; } /* @@ -8740,20 +8876,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(PageBuddy(page)); pfn++; - offlined_pages++; continue; } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); - order = page_order(page); - offlined_pages += 1 << order; + order = buddy_order(page); del_page_from_free_list(page, zone, order); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); - - return offlined_pages; } #endif @@ -8768,7 +8900,7 @@ bool is_free_buddy_page(struct page *page) for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && page_order(page_head) >= order) + if (PageBuddy(page_head) && buddy_order(page_head) >= order) break; } spin_unlock_irqrestore(&zone->lock, flags); @@ -8778,30 +8910,70 @@ bool is_free_buddy_page(struct page *page) #ifdef CONFIG_MEMORY_FAILURE /* - * Set PG_hwpoison flag if a given page is confirmed to be a free page. This - * test is performed under the zone lock to prevent a race against page - * allocation. + * Break down a higher-order page in sub-pages, and keep our target out of + * buddy allocator. */ -bool set_hwpoison_free_buddy_page(struct page *page) +static void break_down_buddy_pages(struct zone *zone, struct page *page, + struct page *target, int low, int high, + int migratetype) +{ + unsigned long size = 1 << high; + struct page *current_buddy, *next_page; + + while (high > low) { + high--; + size >>= 1; + + if (target >= &page[size]) { + next_page = page + size; + current_buddy = page; + } else { + next_page = page; + current_buddy = page + size; + } + + if (set_page_guard(zone, current_buddy, high, migratetype)) + continue; + + if (current_buddy != target) { + add_to_free_list(current_buddy, zone, high, migratetype); + set_buddy_order(current_buddy, high); + page = next_page; + } + } +} + +/* + * Take a page that will be marked as poisoned off the buddy allocator. + */ +bool take_page_off_buddy(struct page *page) { struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; unsigned int order; - bool hwpoisoned = false; + bool ret = false; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); + int page_order = buddy_order(page_head); + + if (PageBuddy(page_head) && page_order >= order) { + unsigned long pfn_head = page_to_pfn(page_head); + int migratetype = get_pfnblock_migratetype(page_head, + pfn_head); - if (PageBuddy(page_head) && page_order(page_head) >= order) { - if (!TestSetPageHWPoison(page)) - hwpoisoned = true; + del_page_from_free_list(page_head, zone, page_order); + break_down_buddy_pages(zone, page_head, page, 0, + page_order, migratetype); + ret = true; break; } + if (page_count(page_head) > 0) + break; } spin_unlock_irqrestore(&zone->lock, flags); - - return hwpoisoned; + return ret; } #endif |