Diffstat (limited to 'mm')
-rw-r--r-- | mm/balloon_compaction.c   |  6
-rw-r--r-- | mm/damon/core.c           | 23
-rw-r--r-- | mm/damon/lru_sort.c       |  5
-rw-r--r-- | mm/damon/reclaim.c        |  5
-rw-r--r-- | mm/damon/sysfs-schemes.c  |  2
-rw-r--r-- | mm/damon/sysfs.c          | 14
-rw-r--r-- | mm/debug_vm_pgtable.c     |  9
-rw-r--r-- | mm/hugetlb.c              |  9
-rw-r--r-- | mm/kasan/init.c           | 12
-rw-r--r-- | mm/kasan/kasan_test_c.c   |  4
-rw-r--r-- | mm/kasan/shadow.c         | 53
-rw-r--r-- | mm/khugepaged.c           |  6
-rw-r--r-- | mm/kmemleak.c             | 37
-rw-r--r-- | mm/memblock.c             | 19
-rw-r--r-- | mm/memory-failure.c       | 28
-rw-r--r-- | mm/memory_hotplug.c       | 10
-rw-r--r-- | mm/migrate.c              | 38
-rw-r--r-- | mm/mprotect.c             | 23
-rw-r--r-- | mm/mremap.c               | 95
-rw-r--r-- | mm/numa_emulation.c       |  4
-rw-r--r-- | mm/numa_memblks.c         |  6
-rw-r--r-- | mm/percpu.c               | 26
-rw-r--r-- | mm/slub.c                 | 37
-rw-r--r-- | mm/sparse-vmemmap.c       | 11
-rw-r--r-- | mm/sparse.c               | 15
-rw-r--r-- | mm/userfaultfd.c          | 26
-rw-r--r-- | mm/vmalloc.c              |  8
-rw-r--r-- | mm/vmscan.c               |  4
-rw-r--r-- | mm/zsmalloc.c             | 10
29 files changed, 364 insertions(+), 181 deletions(-)
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 2a4a649805c1..03c5dbabb156 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -254,4 +254,10 @@ const struct movable_operations balloon_mops = { .putback_page = balloon_page_putback, }; +static int __init balloon_init(void) +{ + return set_movable_ops(&balloon_mops, PGTY_offline); +} +core_initcall(balloon_init); + #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 52a48c9316bc..c2e0b469fd43 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -845,6 +845,18 @@ static struct damos_filter *damos_nth_filter(int n, struct damos *s) return NULL; } +static struct damos_filter *damos_nth_ops_filter(int n, struct damos *s) +{ + struct damos_filter *filter; + int i = 0; + + damos_for_each_ops_filter(filter, s) { + if (i++ == n) + return filter; + } + return NULL; +} + static void damos_commit_filter_arg( struct damos_filter *dst, struct damos_filter *src) { @@ -871,6 +883,7 @@ static void damos_commit_filter( { dst->type = src->type; dst->matching = src->matching; + dst->allow = src->allow; damos_commit_filter_arg(dst, src); } @@ -908,7 +921,7 @@ static int damos_commit_ops_filters(struct damos *dst, struct damos *src) int i = 0, j = 0; damos_for_each_ops_filter_safe(dst_filter, next, dst) { - src_filter = damos_nth_filter(i++, src); + src_filter = damos_nth_ops_filter(i++, src); if (src_filter) damos_commit_filter(dst_filter, src_filter); else @@ -2060,8 +2073,8 @@ static void damos_set_effective_quota(struct damos_quota *quota) if (quota->ms) { if (quota->total_charged_ns) - throughput = quota->total_charged_sz * 1000000 / - quota->total_charged_ns; + throughput = mult_frac(quota->total_charged_sz, 1000000, + quota->total_charged_ns); else throughput = PAGE_SIZE * 1024; esz = min(throughput * quota->ms, esz); @@ -2098,6 +2111,10 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!quota->ms && !quota->sz && list_empty("a->goals)) return; + /* First charge window */ + if (!quota->total_charged_sz && !quota->charged_from) + quota->charged_from = jiffies; + /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 151a9de5ad8b..b5a5ed16a7a5 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -198,6 +198,11 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + if (!damon_lru_sort_mon_attrs.sample_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) goto out; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c71b4596676..fb7c982a0018 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -194,6 +194,11 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; + if (!damon_reclaim_mon_attrs.aggr_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 74056bcd6a2c..6536f16006c9 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2158,8 +2158,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) { damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); kobject_put(&scheme->access_pattern->kobj); - kobject_put(&scheme->dests->kobj); damos_sysfs_dests_rm_dirs(scheme->dests); + kobject_put(&scheme->dests->kobj); 
damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6d2b0dab50cb..7b9254cadd5f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1260,14 +1260,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - struct damon_ctx *ctx = kdamond->damon_ctx; - bool running; + struct damon_ctx *ctx; + bool running = false; - if (!ctx) - running = false; - else + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + + ctx = kdamond->damon_ctx; + if (ctx) running = damon_is_running(ctx); + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%s\n", running ? damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index d19031f275a3..830107b6dd08 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -990,29 +990,34 @@ static void __init destroy_args(struct pgtable_debug_args *args) /* Free page table entries */ if (args->start_ptep) { + pmd_clear(args->pmdp); pte_free(args->mm, args->start_ptep); mm_dec_nr_ptes(args->mm); } if (args->start_pmdp) { + pud_clear(args->pudp); pmd_free(args->mm, args->start_pmdp); mm_dec_nr_pmds(args->mm); } if (args->start_pudp) { + p4d_clear(args->p4dp); pud_free(args->mm, args->start_pudp); mm_dec_nr_puds(args->mm); } - if (args->start_p4dp) + if (args->start_p4dp) { + pgd_clear(args->pgdp); p4d_free(args->mm, args->start_p4dp); + } /* Free vma and mm struct */ if (args->vma) vm_area_free(args->vma); if (args->mm) - mmdrop(args->mm); + mmput(args->mm); } static struct page * __init diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 753f99b4c718..eed59cfb5d21 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5851,7 +5851,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - bool adjust_reservation = false; + bool adjust_reservation; unsigned long last_addr_mask; bool force_flush = false; @@ -5944,6 +5944,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); + spin_unlock(ptl); /* * Restore the reservation for anonymous page, otherwise the @@ -5951,14 +5952,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * If there we are freeing a surplus, do not set the restore * reservation bit. 
*/ + adjust_reservation = false; + + spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } - - spin_unlock(ptl); + spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ced6b29fcf76..8fce3370c84e 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -13,9 +13,9 @@ #include <linux/mm.h> #include <linux/pfn.h> #include <linux/slab.h> +#include <linux/pgalloc.h> #include <asm/page.h> -#include <asm/pgalloc.h> #include "kasan.h" @@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, pud_t *pud; pmd_t *pmd; - p4d_populate(&init_mm, p4d, + p4d_populate_kernel(addr, p4d, lm_alias(kasan_early_shadow_pud)); pud = pud_offset(p4d, addr); pud_populate(&init_mm, pud, @@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, } else { p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); pud_init(p); - p4d_populate(&init_mm, p4d, p); + p4d_populate_kernel(addr, p4d, p); } } zero_pud_populate(p4d, addr, next); @@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, * puds,pmds, so pgd_populate(), pud_populate() * is noops. */ - pgd_populate(&init_mm, pgd, + pgd_populate_kernel(addr, pgd, lm_alias(kasan_early_shadow_p4d)); p4d = p4d_offset(pgd, addr); - p4d_populate(&init_mm, p4d, + p4d_populate_kernel(addr, p4d, lm_alias(kasan_early_shadow_pud)); pud = pud_offset(p4d, addr); pud_populate(&init_mm, pud, @@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, if (!p) return -ENOMEM; } else { - pgd_populate(&init_mm, pgd, + pgd_populate_kernel(addr, pgd, early_alloc(PAGE_SIZE, NUMA_NO_NODE)); } } diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 2aa12dfa427a..f4b17984b627 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -47,7 +47,7 @@ static struct { * Some tests use these global variables to store return values from function * calls that could otherwise be eliminated by the compiler as dead code. */ -static volatile void *kasan_ptr_result; +static void *volatile kasan_ptr_result; static volatile int kasan_int_result; /* Probe for console output: obtains test_status lines of interest. 
*/ @@ -1578,9 +1578,11 @@ static void kasan_strings(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO); strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE); + OPTIMIZER_HIDE_VAR(src); /* * Make sure that strscpy() does not trigger KASAN if it overreads into diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index d2c70cd2afb1..11d472a5c4e8 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,8 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - if (likely(!pte_none(ptep_get(ptep)))) - return 0; + arch_leave_lazy_mmu_mode(); index = PFN_DOWN(addr - data->start); page = data->pages[index]; @@ -320,6 +319,8 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); + arch_enter_lazy_mmu_mode(); + return 0; } @@ -335,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages) } } -static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask) { unsigned long nr_populated, nr_total = nr_pages; struct page **page_array = pages; while (nr_pages) { - nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages); if (!nr_populated) { ___free_pages_bulk(page_array, nr_total - nr_pages); return -ENOMEM; @@ -353,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages) return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; + unsigned int flags; int ret = 0; - data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO); if (!data.pages) return -ENOMEM; while (nr_total) { nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); - ret = ___alloc_pages_bulk(data.pages, nr_pages); + ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask); if (ret) break; data.start = start; + + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + ___free_pages_bulk(data.pages, nr_pages); if (ret) break; @@ -385,7 +403,7 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; @@ -414,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end, 
gfp_mask); if (ret) return ret; @@ -461,18 +479,23 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { - unsigned long page; + pte_t pte; + int none; - page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); + arch_leave_lazy_mmu_mode(); spin_lock(&init_mm.page_table_lock); - - if (likely(!pte_none(ptep_get(ptep)))) { + pte = ptep_get(ptep); + none = pte_none(pte); + if (likely(!none)) pte_clear(&init_mm, addr, ptep); - free_page(page); - } spin_unlock(&init_mm.page_table_lock); + if (likely(!none)) + __free_page(pfn_to_page(pte_pfn(pte))); + + arch_enter_lazy_mmu_mode(); + return 0; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 374a6a5193a7..b486c1d19b2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1172,11 +1172,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ + vma_start_write(vma); result = check_pmd_still_valid(mm, address, pmd); if (result != SCAN_SUCCEED) goto out_up_write; - vma_start_write(vma); anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, @@ -1417,8 +1417,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, _address))) referenced++; } if (!writable) { diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8d588e685311..1ac56ceb29b6 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias, else if (untagged_objp == untagged_ptr || alias) return object; else { + /* + * Printk deferring due to the kmemleak_lock held. + * This is done to avoid deadlock. + */ + printk_deferred_enter(); kmemleak_warn("Found object by alias at 0x%08lx\n", ptr); dump_object_info(object); + printk_deferred_exit(); break; } } @@ -470,6 +476,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; + bool warn = false; /* try the slab allocator first */ if (object_cache) { @@ -488,8 +495,10 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) else if (mem_pool_free_count) object = &mem_pool[--mem_pool_free_count]; else - pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); + warn = true; raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + if (warn) + pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); return object; } @@ -733,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, else if (untagged_objp + parent->size <= untagged_ptr) link = &parent->rb_node.rb_right; else { + /* + * Printk deferring due to the kmemleak_lock held. + * This is done to avoid deadlock. + */ + printk_deferred_enter(); kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", ptr); /* @@ -740,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, * be freed while the kmemleak_lock is held. 
*/ dump_object_info(parent); + printk_deferred_exit(); return -EEXIST; } } @@ -853,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size, raw_spin_lock_irqsave(&kmemleak_lock, flags); object = __find_and_remove_object(ptr, 1, objflags); - if (!object) { -#ifdef DEBUG - kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", - ptr, size); -#endif + if (!object) goto unlock; - } /* * Create one or two objects that may result from the memory block @@ -879,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size, unlock: raw_spin_unlock_irqrestore(&kmemleak_lock, flags); - if (object) + if (object) { __delete_object(object); + } else { +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); +#endif + } out: if (object_l) @@ -2181,6 +2197,7 @@ static const struct file_operations kmemleak_fops = { static void __kmemleak_do_cleanup(void) { struct kmemleak_object *object, *tmp; + unsigned int cnt = 0; /* * Kmemleak has already been disabled, no need for RCU list traversal @@ -2189,6 +2206,10 @@ static void __kmemleak_do_cleanup(void) list_for_each_entry_safe(object, tmp, &object_list, object_list) { __remove_object(object); __delete_object(object); + + /* Call cond_resched() once per 64 iterations to avoid soft lockup */ + if (!(++cnt & 0x3f)) + cond_resched(); } } diff --git a/mm/memblock.c b/mm/memblock.c index 154f1d73b61f..117d963e677c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -780,9 +780,9 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt } if ((nr_pages << PAGE_SHIFT) > threshold_bytes) { - mem_size_mb = memblock_phys_mem_size() >> 20; + mem_size_mb = memblock_phys_mem_size() / SZ_1M; pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n", - (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb); + (nr_pages << PAGE_SHIFT) / SZ_1M, mem_size_mb); return false; } @@ -1091,13 +1091,20 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) /** * memblock_reserved_mark_noinit - Mark a reserved memory region with flag - * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized - * for this region. + * MEMBLOCK_RSRV_NOINIT + * * @base: the base phys addr of the region * @size: the size of the region * - * struct pages will not be initialized for reserved memory regions marked with - * %MEMBLOCK_RSRV_NOINIT. + * The struct pages for the reserved regions marked %MEMBLOCK_RSRV_NOINIT will + * not be fully initialized to allow the caller optimize their initialization. + * + * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, setting this flag + * completely bypasses the initialization of struct pages for such region. + * + * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is disabled, struct pages in this + * region will be initialized with default values but won't be marked as + * reserved. * * Return: 0 on success, -errno on failure. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e2e685b971bb..df6ee59527dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -853,9 +853,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, #define hwpoison_hugetlb_range NULL #endif +static int hwpoison_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + /* We also want to consider pages mapped into VM_PFNMAP. 
*/ + return 0; +} + static const struct mm_walk_ops hwpoison_walk_ops = { .pmd_entry = hwpoison_pte_range, .hugetlb_entry = hwpoison_hugetlb_range, + .test_walk = hwpoison_test_walk, .walk_lock = PGWALK_RDLOCK, }; @@ -948,7 +956,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", - [MF_MSG_ALREADY_POISONED] = "already poisoned", + [MF_MSG_ALREADY_POISONED] = "already poisoned page", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1341,9 +1349,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(pfn); - - update_per_node_mf_stats(pfn, result); + if (type != MF_MSG_ALREADY_POISONED) { + num_poisoned_pages_inc(pfn); + update_per_node_mf_stats(pfn, result); + } pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -2086,12 +2095,11 @@ retry: *hugetlb = 0; return 0; } else if (res == -EHWPOISON) { - pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); res = kill_accessing_process(current, folio_pfn(folio), flags); - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); return res; } else if (res == -EBUSY) { if (!(flags & MF_NO_RETRY)) { @@ -2277,7 +2285,6 @@ try_again: goto unlock_mutex; if (TestSetPageHWPoison(p)) { - pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); @@ -2561,10 +2568,9 @@ int unpoison_memory(unsigned long pfn) static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (!pfn_valid(pfn)) - return -ENXIO; - - p = pfn_to_page(pfn); + p = pfn_to_online_page(pfn); + if (!p) + return -EIO; folio = page_folio(p); mutex_lock(&mf_mutex); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f15af712bc3..74318c787715 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; if (folio_contain_hwpoisoned_page(folio)) { - if (WARN_ON(folio_test_lru(folio))) - folio_isolate_lru(folio); + /* + * unmap_poisoned_folio() cannot handle large folios + * in all cases yet. + */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + goto put_folio; + if (folio_test_lru(folio) && !folio_isolate_lru(folio)) + goto put_folio; if (folio_mapped(folio)) { folio_lock(folio); unmap_poisoned_folio(folio, pfn, false); diff --git a/mm/migrate.c b/mm/migrate.c index 425401b2d4e1..9e5ef39ce73a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -43,8 +43,6 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/pagewalk.h> -#include <linux/balloon_compaction.h> -#include <linux/zsmalloc.h> #include <asm/tlbflush.h> @@ -53,6 +51,33 @@ #include "internal.h" #include "swap.h" +static const struct movable_operations *offline_movable_ops; +static const struct movable_operations *zsmalloc_movable_ops; + +int set_movable_ops(const struct movable_operations *ops, enum pagetype type) +{ + /* + * We only allow for selected types and don't handle concurrent + * registration attempts yet. 
+ */ + switch (type) { + case PGTY_offline: + if (offline_movable_ops && ops) + return -EBUSY; + offline_movable_ops = ops; + break; + case PGTY_zsmalloc: + if (zsmalloc_movable_ops && ops) + return -EBUSY; + zsmalloc_movable_ops = ops; + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL_GPL(set_movable_ops); + static const struct movable_operations *page_movable_ops(struct page *page) { VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); @@ -62,15 +87,12 @@ static const struct movable_operations *page_movable_ops(struct page *page) * it as movable, the page type must be sticky until the page gets freed * back to the buddy. */ -#ifdef CONFIG_BALLOON_COMPACTION if (PageOffline(page)) /* Only balloon compaction sets PageOffline pages movable. */ - return &balloon_mops; -#endif /* CONFIG_BALLOON_COMPACTION */ -#if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) + return offline_movable_ops; if (PageZsmalloc(page)) - return &zsmalloc_mops; -#endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */ + return zsmalloc_movable_ops; + return NULL; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 78bded7acf79..113b48985834 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -120,9 +120,8 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, pte_t oldpte, pte_t *pte, int target_node, - struct folio **foliop) + struct folio *folio) { - struct folio *folio = NULL; bool ret = true; bool toptier; int nid; @@ -131,7 +130,6 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, if (pte_protnone(oldpte)) goto skip; - folio = vm_normal_folio(vma, addr, oldpte); if (!folio) goto skip; @@ -173,7 +171,6 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); skip: - *foliop = folio; return ret; } @@ -231,10 +228,9 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len, * retrieve sub-batches. 
*/ static void commit_anon_folio_batch(struct vm_area_struct *vma, - struct folio *folio, unsigned long addr, pte_t *ptep, + struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { - struct page *first_page = folio_page(folio, 0); bool expected_anon_exclusive; int sub_batch_idx = 0; int len; @@ -251,7 +247,7 @@ static void commit_anon_folio_batch(struct vm_area_struct *vma, } static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, - struct folio *folio, unsigned long addr, pte_t *ptep, + struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { bool set_write; @@ -270,7 +266,7 @@ static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, /* idx = */ 0, set_write, tlb); return; } - commit_anon_folio_batch(vma, folio, addr, ptep, oldpte, ptent, nr_ptes, tlb); + commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb); } static long change_pte_range(struct mmu_gather *tlb, @@ -305,15 +301,19 @@ static long change_pte_range(struct mmu_gather *tlb, const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE; int max_nr_ptes = (end - addr) >> PAGE_SHIFT; struct folio *folio = NULL; + struct page *page; pte_t ptent; + page = vm_normal_page(vma, addr, oldpte); + if (page) + folio = page_folio(page); /* * Avoid trapping faults against the zero or KSM * pages. See similar comment in change_huge_pmd. */ if (prot_numa) { int ret = prot_numa_skip(vma, addr, oldpte, pte, - target_node, &folio); + target_node, folio); if (ret) { /* determine batch to skip */ @@ -323,9 +323,6 @@ static long change_pte_range(struct mmu_gather *tlb, } } - if (!folio) - folio = vm_normal_folio(vma, addr, oldpte); - nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes); @@ -351,7 +348,7 @@ static long change_pte_range(struct mmu_gather *tlb, */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pte_write(ptent)) - set_write_prot_commit_flush_ptes(vma, folio, + set_write_prot_commit_flush_ptes(vma, folio, page, addr, pte, oldpte, ptent, nr_ptes, tlb); else prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, diff --git a/mm/mremap.c b/mm/mremap.c index 677a4d744df9..35de0a7b910e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -179,6 +179,10 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (max_nr == 1) return 1; + /* Avoid expensive folio lookup if we stand no chance of benefit. */ + if (pte_batch_hint(ptep, pte) == 1) + return 1; + folio = vm_normal_folio(vma, addr, pte); if (!folio || !folio_test_large(folio)) return 1; @@ -319,6 +323,25 @@ static inline bool arch_supports_page_table_move(void) } #endif +static inline bool uffd_supports_page_table_move(struct pagetable_move_control *pmc) +{ + /* + * If we are moving a VMA that has uffd-wp registered but with + * remap events disabled (new VMA will not be registered with uffd), we + * need to ensure that the uffd-wp state is cleared from all pgtables. + * This means recursing into lower page tables in move_page_tables(). + * + * We might get called with VMAs reversed when recovering from a + * failed page table move. In that case, the + * "old"-but-actually-"originally new" VMA during recovery will not have + * a uffd context. 
Recursing into lower page tables during the original + * move but not during the recovery move will cause trouble, because we + * run into already-existing page tables. So check both VMAs. + */ + return !vma_has_uffd_without_event_remap(pmc->old) && + !vma_has_uffd_without_event_remap(pmc->new); +} + #ifdef CONFIG_HAVE_MOVE_PMD static bool move_normal_pmd(struct pagetable_move_control *pmc, pmd_t *old_pmd, pmd_t *new_pmd) @@ -331,6 +354,8 @@ static bool move_normal_pmd(struct pagetable_move_control *pmc, if (!arch_supports_page_table_move()) return false; + if (!uffd_supports_page_table_move(pmc)) + return false; /* * The destination pmd shouldn't be established, free_pgtables() * should have released it. @@ -357,15 +382,6 @@ static bool move_normal_pmd(struct pagetable_move_control *pmc, if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; - /* If this pmd belongs to a uffd vma with remap events disabled, we need - * to ensure that the uffd-wp state is cleared from all pgtables. This - * means recursing into lower page tables in move_page_tables(), and we - * can reuse the existing code if we simply treat the entry as "not - * moved". - */ - if (vma_has_uffd_without_event_remap(vma)) - return false; - /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. @@ -414,6 +430,8 @@ static bool move_normal_pud(struct pagetable_move_control *pmc, if (!arch_supports_page_table_move()) return false; + if (!uffd_supports_page_table_move(pmc)) + return false; /* * The destination pud shouldn't be established, free_pgtables() * should have released it. @@ -421,15 +439,6 @@ static bool move_normal_pud(struct pagetable_move_control *pmc, if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; - /* If this pud belongs to a uffd vma with remap events disabled, we need - * to ensure that the uffd-wp state is cleared from all pgtables. This - * means recursing into lower page tables in move_page_tables(), and we - * can reuse the existing code if we simply treat the entry as "not - * moved". - */ - if (vma_has_uffd_without_event_remap(vma)) - return false; - /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. @@ -1616,7 +1625,7 @@ static void notify_uffd(struct vma_remap_struct *vrm, bool failed) static bool vma_multi_allowed(struct vm_area_struct *vma) { - struct file *file; + struct file *file = vma->vm_file; /* * We can't support moving multiple uffd VMAs as notify requires @@ -1629,15 +1638,17 @@ static bool vma_multi_allowed(struct vm_area_struct *vma) * Custom get unmapped area might result in MREMAP_FIXED not * being obeyed. */ - file = vma->vm_file; - if (file && !vma_is_shmem(vma) && !is_vm_hugetlb_page(vma)) { - const struct file_operations *fop = file->f_op; - - if (fop->get_unmapped_area) - return false; - } + if (!file || !file->f_op->get_unmapped_area) + return true; + /* Known good. */ + if (vma_is_shmem(vma)) + return true; + if (is_vm_hugetlb_page(vma)) + return true; + if (file->f_op->get_unmapped_area == thp_get_unmapped_area) + return true; - return true; + return false; } static int check_prep_vma(struct vma_remap_struct *vrm) @@ -1763,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (!vrm->new_len) return -EINVAL; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) + /* Is the new length silly? 
*/ + if (vrm->new_len > TASK_SIZE) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; + /* Is the new address silly? */ + if (vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + /* The new address must be page-aligned. */ if (offset_in_page(vrm->new_addr)) return -EINVAL; @@ -1814,10 +1828,11 @@ static unsigned long remap_move(struct vma_remap_struct *vrm) unsigned long start = vrm->addr; unsigned long end = vrm->addr + vrm->old_len; unsigned long new_addr = vrm->new_addr; - bool allowed = true, seen_vma = false; unsigned long target_addr = new_addr; unsigned long res = -EFAULT; unsigned long last_end; + bool seen_vma = false; + VMA_ITERATOR(vmi, current->mm, start); /* @@ -1830,9 +1845,7 @@ static unsigned long remap_move(struct vma_remap_struct *vrm) unsigned long addr = max(vma->vm_start, start); unsigned long len = min(end, vma->vm_end) - addr; unsigned long offset, res_vma; - - if (!allowed) - return -EFAULT; + bool multi_allowed; /* No gap permitted at the start of the range. */ if (!seen_vma && start < vma->vm_start) @@ -1861,9 +1874,15 @@ static unsigned long remap_move(struct vma_remap_struct *vrm) vrm->new_addr = target_addr + offset; vrm->old_len = vrm->new_len = len; - allowed = vma_multi_allowed(vma); - if (seen_vma && !allowed) - return -EFAULT; + multi_allowed = vma_multi_allowed(vma); + if (!multi_allowed) { + /* This is not the first VMA, abort immediately. */ + if (seen_vma) + return -EFAULT; + /* This is the first, but there are more, abort. */ + if (vma->vm_end < end) + return -EFAULT; + } res_vma = check_prep_vma(vrm); if (!res_vma) @@ -1872,7 +1891,7 @@ static unsigned long remap_move(struct vma_remap_struct *vrm) return res_vma; if (!seen_vma) { - VM_WARN_ON_ONCE(allowed && res_vma != new_addr); + VM_WARN_ON_ONCE(multi_allowed && res_vma != new_addr); res = res_vma; } diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c index 9d55679d99ce..703c8fa05048 100644 --- a/mm/numa_emulation.c +++ b/mm/numa_emulation.c @@ -73,7 +73,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, } printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", - nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); + nid, eb->start, eb->end - 1, (eb->end - eb->start) / SZ_1M); return 0; } @@ -264,7 +264,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); if (size < min_size) { pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); + size / SZ_1M, min_size / SZ_1M); size = min_size; } size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index 541a99c4071a..5b009a9cd8b4 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -76,7 +76,7 @@ static int __init numa_alloc_distance(void) for (j = 0; j < cnt; j++) numa_distance[i * cnt + j] = i == j ? 
LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt); return 0; } @@ -427,9 +427,9 @@ static int __init numa_register_meminfo(struct numa_meminfo *mi) unsigned long pfn_align = node_map_pfn_alignment(); if (pfn_align && pfn_align < PAGES_PER_SECTION) { - unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20; + unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M; - unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20; + unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M; pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n", node_align_mb, sect_align_mb); diff --git a/mm/percpu.c b/mm/percpu.c index d9cbaee92b60..81462ce5866e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; - static int warn_limit = 10; + static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; @@ -1904,13 +1904,17 @@ fail_unlock: fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (do_warn && warn_limit) { - pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); - if (!is_atomic) - dump_stack(); - if (!--warn_limit) - pr_info("limit reached, disable warning\n"); + if (do_warn) { + int remaining = atomic_dec_if_positive(&warn_limit); + + if (remaining >= 0) { + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); + if (!is_atomic) + dump_stack(); + if (remaining == 0) + pr_info("limit reached, disable warning\n"); + } } if (is_atomic) { @@ -3108,7 +3112,7 @@ out_free: #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK -#include <asm/pgalloc.h> +#include <linux/pgalloc.h> #ifndef P4D_TABLE_SIZE #define P4D_TABLE_SIZE PAGE_SIZE @@ -3134,13 +3138,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr) if (pgd_none(*pgd)) { p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - pgd_populate(&init_mm, pgd, p4d); + pgd_populate_kernel(addr, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - p4d_populate(&init_mm, p4d, pud); + p4d_populate_kernel(addr, p4d, pud); } pud = pud_offset(p4d, addr); diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..d257141896c9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -962,19 +962,19 @@ static struct track *get_track(struct kmem_cache *s, void *object, } #ifdef CONFIG_STACKDEPOT -static noinline depot_stack_handle_t set_track_prepare(void) +static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { depot_stack_handle_t handle; unsigned long entries[TRACK_ADDRS_COUNT]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); - handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); + handle = stack_depot_save(entries, nr_entries, gfp_flags); return handle; } #else -static inline depot_stack_handle_t set_track_prepare(void) +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } @@ -996,9 +996,9 @@ static void set_track_update(struct kmem_cache *s, void *object, } static __always_inline void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) + enum track_item alloc, unsigned long addr, 
gfp_t gfp_flags) { - depot_stack_handle_t handle = set_track_prepare(); + depot_stack_handle_t handle = set_track_prepare(gfp_flags); set_track_update(s, object, alloc, addr, handle); } @@ -1140,7 +1140,12 @@ static void object_err(struct kmem_cache *s, struct slab *slab, return; slab_bug(s, reason); - print_trailer(s, slab, object); + if (!object || !check_valid_pointer(s, slab, object)) { + print_slab_info(slab); + pr_err("Invalid pointer 0x%p\n", object); + } else { + print_trailer(s, slab, object); + } add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); WARN_ON(1); @@ -1921,9 +1926,9 @@ static inline bool free_debug_processing(struct kmem_cache *s, static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } -static inline depot_stack_handle_t set_track_prepare(void) { return 0; } +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } static inline void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) {} + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -3876,9 +3881,14 @@ new_objects: * For debug caches here we had to go through * alloc_single_from_partial() so just store the * tracking info and return the object. + * + * Due to disabled preemption we need to disallow + * blocking. The flags are further adjusted by + * gfp_nested_mask() in stack_depot itself. */ if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); return freelist; } @@ -3910,7 +3920,8 @@ new_objects: goto new_objects; if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); return freelist; } @@ -4421,8 +4432,12 @@ static noinline void free_to_partial_list( unsigned long flags; depot_stack_handle_t handle = 0; + /* + * We cannot use GFP_NOWAIT as there are callsites where waking up + * kswapd could deadlock + */ if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); + handle = set_track_prepare(__GFP_NOWARN); spin_lock_irqsave(&n->list_lock, flags); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index fd2ab5118e13..dbd8daccade2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -27,9 +27,9 @@ #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/pgalloc.h> #include <asm/dma.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" @@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) if (!p) return NULL; pud_init(p); - p4d_populate(&init_mm, p4d, p); + p4d_populate_kernel(addr, p4d, p); } return p4d; } @@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; - pgd_populate(&init_mm, pgd, p); + pgd_populate_kernel(addr, pgd, p); } return pgd; } @@ -578,11 +578,6 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, if (r < 0) return NULL; - if (system_state == SYSTEM_BOOTING) - memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); - else - 
memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); - return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index 3c012cf83cc2..e6075b622407 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -454,9 +454,6 @@ static void __init sparse_buffer_init(unsigned long size, int nid) */ sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; -#ifndef CONFIG_SPARSEMEM_VMEMMAP - memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); -#endif } static void __init sparse_buffer_fini(void) @@ -567,6 +564,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, sparse_buffer_fini(); goto failed; } + memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), + PAGE_SIZE)); sparse_init_early_section(nid, map, pnum, 0); } } @@ -680,7 +679,6 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); - memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); vmemmap_free(start, end, altmap); } static void free_map_bootmem(struct page *memmap) @@ -856,10 +854,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * The memmap of early sections is always fully populated. See * section_activate() and pfn_valid() . */ - if (!section_is_early) + if (!section_is_early) { + memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); depopulate_section_memmap(pfn, nr_pages, altmap); - else if (memmap) + } else if (memmap) { + memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), + PAGE_SIZE))); free_map_bootmem(memmap); + } if (empty) ms->section_mem_map = (unsigned long)NULL; @@ -904,6 +906,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, section_deactivate(pfn, nr_pages, altmap); return ERR_PTR(-ENOMEM); } + memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); return memmap; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index cbed91b09640..aefdf3a812a1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1453,10 +1453,15 @@ out: folio_unlock(src_folio); folio_put(src_folio); } - if (dst_pte) - pte_unmap(dst_pte); + /* + * Unmap in reverse order (LIFO) to maintain proper kmap_local + * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte + * first, then src_pte, so we must unmap src_pte first, then dst_pte. + */ if (src_pte) pte_unmap(src_pte); + if (dst_pte) + pte_unmap(dst_pte); mmu_notifier_invalidate_range_end(&range); if (si) put_swap_device(si); @@ -1821,13 +1826,16 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, /* Check if we can move the pmd without splitting it. 
*/ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { - struct folio *folio = pmd_folio(*src_pmd); - - if (!folio || (!is_huge_zero_folio(folio) && - !PageAnonExclusive(&folio->page))) { - spin_unlock(ptl); - err = -EBUSY; - break; + /* Can be a migration entry */ + if (pmd_present(*src_pmd)) { + struct folio *folio = pmd_folio(*src_pmd); + + if (!is_huge_zero_folio(folio) && + !PageAnonExclusive(&folio->page)) { + spin_unlock(ptl); + err = -EBUSY; + break; + } } spin_unlock(ptl); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae1..5edd536ba9d2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); + /* Only reclaim behaviour flags are relevant. */ + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; might_sleep(); /* @@ -2038,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -2089,7 +2089,7 @@ retry: BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); - ret = kasan_populate_vmalloc(addr, size); + ret = kasan_populate_vmalloc(addr, size, gfp_mask); if (ret) { free_vmap_area(va); return ERR_PTR(ret); @@ -4826,7 +4826,7 @@ retry: /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { - if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) goto err_free_shadow; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7de11524a936..a48aec8bfd92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5772,9 +5772,9 @@ static int __init init_lru_gen(void) if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); - debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, 1, + debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false, &lru_gen_rw_fops); - debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, 0, + debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true, &lru_gen_ro_fops); return 0; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2c5e56a65354..805a10b41266 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2246,9 +2246,16 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { + int rc __maybe_unused; + #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif +#ifdef CONFIG_COMPACTION + rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc); + if (rc) + return rc; +#endif zs_stat_init(); return 0; } @@ -2258,6 +2265,9 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif +#ifdef CONFIG_COMPACTION + set_movable_ops(NULL, PGTY_zsmalloc); +#endif zs_stat_exit(); } |
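
The mm/damon/core.c hunk above switches the throughput estimate from an open-coded total_charged_sz * 1000000 / total_charged_ns to mult_frac(), which divides before multiplying so the intermediate product cannot wrap. A minimal user-space sketch of that split-before-multiply idea, assuming a 64-bit build; scale_frac() and the sample values are illustrative, not the kernel macro itself:

    #include <stdio.h>

    /* Compute (x * numer) / denom without forming the full x * numer product. */
    static unsigned long scale_frac(unsigned long x, unsigned long numer,
                                    unsigned long denom)
    {
            unsigned long quot = x / denom;
            unsigned long rem  = x % denom;

            return quot * numer + (rem * numer) / denom;
    }

    int main(void)
    {
            unsigned long charged_sz = 1UL << 45;        /* ~32 TiB charged  */
            unsigned long charged_ns = 2000000000UL;     /* over 2 seconds   */

            /* The naive form wraps around 2^64 and reports a wrong value. */
            printf("naive     : %lu bytes/ms\n",
                   charged_sz * 1000000 / charged_ns);
            printf("split form: %lu bytes/ms\n",
                   scale_frac(charged_sz, 1000000, charged_ns));
            return 0;
    }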
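
The mm/damon/sysfs.c state_show() hunk above reads kdamond->damon_ctx under mutex_trylock() and returns -EBUSY rather than sleeping in a sysfs read. A rough user-space sketch of the same non-blocking read pattern using pthreads; all names here are invented for illustration:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static int running;    /* state normally updated by a worker thread */

    /* Report the current state, or -EBUSY if the lock is contended. */
    static int state_show(char *buf, size_t len)
    {
            int on;

            if (pthread_mutex_trylock(&state_lock))
                    return -EBUSY;
            on = running;
            pthread_mutex_unlock(&state_lock);

            return snprintf(buf, len, "%s\n", on ? "on" : "off");
    }

    int main(void)
    {
            char buf[8];

            if (state_show(buf, sizeof(buf)) > 0)
                    fputs(buf, stdout);
            return 0;
    }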
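
The mm/kasan/kasan_test_c.c hunk above changes kasan_ptr_result from "volatile void *" (pointer to volatile data) to "void *volatile" (volatile pointer object), so stores to the variable itself cannot be optimized away. A small stand-alone C illustration of the two declarators; the variable names are made up:

    #include <stdlib.h>

    static volatile void *points_to_volatile;   /* the pointed-to data is volatile */
    static void *volatile pointer_is_volatile;  /* the pointer object is volatile  */

    int main(void)
    {
            void *p = malloc(16);

            /*
             * Only the second assignment is a volatile access: the compiler
             * must keep it, which is what the KASAN test relies on to keep
             * its result-sink variable alive.
             */
            points_to_volatile = p;
            pointer_is_volatile = p;

            free(p);
            return 0;
    }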
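
The mm/kmemleak.c cleanup hunk above calls cond_resched() once every 64 objects via a counter masked with 0x3f. The same periodic-yield trick in plain user-space C, with sched_yield() standing in for cond_resched():

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int cnt = 0;

            for (int i = 0; i < 1000; i++) {
                    /* ... per-object cleanup work would go here ... */

                    /* The low 6 bits are zero once per 64 increments. */
                    if (!(++cnt & 0x3f))
                            sched_yield();
            }
            printf("yielded %u times\n", cnt >> 6);
            return 0;
    }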
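
The mm/mremap.c check_mremap_params() hunk above keeps the bound check in the "new_addr > TASK_SIZE - new_len" form, which cannot wrap, instead of adding the length to the address. A user-space sketch of why that ordering matters; LIMIT and the sample addresses are illustrative and assume a 64-bit build:

    #include <stdbool.h>
    #include <stdio.h>

    #define LIMIT (1UL << 47)    /* stand-in for TASK_SIZE */

    static bool range_ok(unsigned long addr, unsigned long len)
    {
            if (len > LIMIT)
                    return false;
            /* addr + len > LIMIT could wrap; this form cannot. */
            return addr <= LIMIT - len;
    }

    int main(void)
    {
            /* Fits exactly at the top of the allowed range: prints 1. */
            printf("%d\n", range_ok(LIMIT - 4096, 4096));
            /* addr + len wraps to 4095 and would wrongly pass a naive
             * check; this form rejects it and prints 0. */
            printf("%d\n", range_ok(~0UL - 4096, 2 * 4096));
            return 0;
    }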
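
The mm/percpu.c hunk above turns warn_limit into an atomic countdown via atomic_dec_if_positive() so racing allocation failures cannot over- or under-count the remaining warnings. A user-space approximation with C11 atomics; dec_if_positive() emulates the kernel helper and report_failure() is invented:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int warn_limit = 10;

    /* Decrement only while positive; returns the old value minus one. */
    static int dec_if_positive(atomic_int *v)
    {
            int old = atomic_load(v);

            while (old > 0 &&
                   !atomic_compare_exchange_weak(v, &old, old - 1))
                    ;
            return old - 1;    /* negative: the limit was already spent */
    }

    static void report_failure(size_t size)
    {
            int remaining = dec_if_positive(&warn_limit);

            if (remaining >= 0) {
                    fprintf(stderr, "allocation failed, size=%zu\n", size);
                    if (remaining == 0)
                            fprintf(stderr, "limit reached, disable warning\n");
            }
    }

    int main(void)
    {
            for (int i = 0; i < 12; i++)
                    report_failure(4096);
            return 0;
    }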