From acc53a0b4c156877773da6e9eea4113dc7e770ae Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 14 May 2025 19:15:07 +0100 Subject: mm: rename page->index to page->__folio_index All users of page->index have been converted to not refer to it any more. Update a few pieces of documentation that were missed and prevent new users from appearing (or at least make them easy to grep for). Link: https://lkml.kernel.org/r/20250514181508.3019795-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/filemap.c | 4 ++-- mm/memory.c | 4 ++-- mm/page-writeback.c | 6 +++--- mm/truncate.c | 2 +- mm/zpdesc.h | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 09d005848f0d..bc244a2edc93 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -142,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping, xas_init_marks(&xas); folio->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ + /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } @@ -949,7 +949,7 @@ unlock: return 0; error: folio->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ + /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } diff --git a/mm/memory.c b/mm/memory.c index 5cb48f262ab0..37d8738f5e12 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4668,8 +4668,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* * KSM sometimes has to copy on read faults, for example, if - * page->index of !PageKSM() pages would be nonlinear inside the - * anon VMA -- PageKSM() is lost on actual swapout. + * folio->index of non-ksm folios would be nonlinear inside the + * anon VMA -- the ksm flag is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9ff44b64d3d6..e8641aa1043e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2565,11 +2565,11 @@ struct folio *writeback_iter(struct address_space *mapping, if (!folio) { /* * To avoid deadlocks between range_cyclic writeback and callers - * that hold pages in PageWriteback to aggregate I/O until + * that hold folios in writeback to aggregate I/O until * the writeback iteration finishes, we do not loop back to the - * start of the file. Doing so causes a page lock/page + * start of the file. Doing so causes a folio lock/folio * writeback access order inversion - we should only ever lock - * multiple pages in ascending page->index order, and looping + * multiple folios in ascending folio->index order, and looping * back to the start of the file violates that rule and causes * deadlocks. 
*/ diff --git a/mm/truncate.c b/mm/truncate.c index 5d98054094d1..3c1bcdc3a3e9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -421,7 +421,7 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) continue; diff --git a/mm/zpdesc.h b/mm/zpdesc.h index 57e7a4d6c6ca..d3df316e5bb7 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -54,8 +54,8 @@ struct zpdesc { ZPDESC_MATCH(flags, flags); ZPDESC_MATCH(lru, lru); ZPDESC_MATCH(mapping, movable_ops); -ZPDESC_MATCH(index, next); -ZPDESC_MATCH(index, handle); +ZPDESC_MATCH(__folio_index, next); +ZPDESC_MATCH(__folio_index, handle); ZPDESC_MATCH(private, zspage); ZPDESC_MATCH(page_type, first_obj_offset); ZPDESC_MATCH(_refcount, _refcount); -- cgit v1.2.3 From 25352d2f2dc630b33cb61228dbdf7a0a2d6a4117 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 18 May 2025 23:31:38 -0700 Subject: memcg: disable kmem charging in nmi for unsupported arch Patch series "memcg: nmi-safe kmem charging", v4. Users can attach their BPF programs at arbitrary execution points in the kernel and such BPF programs may run in nmi context. In addition, these programs can trigger memcg charged kernel allocations in the nmi context. However memcg charging infra for kernel memory is not equipped to handle nmi context for all architectures. This series removes the hurdles to enable kmem charging in the nmi context for most of the archs. For archs without CONFIG_HAVE_NMI, this series is a noop. For archs with NMI support that have CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS, the previous work to make memcg stats re-entrant is sufficient for allowing kmem charging in nmi context. For archs with NMI support but without CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS and with ARCH_HAVE_NMI_SAFE_CMPXCHG, this series adds infra to support kmem charging in nmi context. Lastly, for those archs with NMI support but without CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS and ARCH_HAVE_NMI_SAFE_CMPXCHG, kmem charging in nmi context is not supported at all. Most commonly used archs support CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS, and this series should be almost a noop (other than making memcg_rstat_updated nmi safe) for such archs. This patch (of 5): The memcg accounting and stats use this_cpu* and atomic* ops. There are archs which define CONFIG_HAVE_NMI but do not define CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS and ARCH_HAVE_NMI_SAFE_CMPXCHG, so memcg accounting in nmi context cannot be supported on such archs. Let's just disable memcg accounting in nmi context for such archs. Link: https://lkml.kernel.org/r/20250519063142.111219-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20250519063142.111219-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Tejun Heo Signed-off-by: Andrew Morton --- init/Kconfig | 7 +++++++ mm/memcontrol.c | 3 +++ 2 files changed, 10 insertions(+) (limited to 'mm') diff --git a/init/Kconfig b/init/Kconfig index 4cdd1049283c..a2aa49cfb8bd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1006,6 +1006,13 @@ config MEMCG help Provides control over the memory footprint of tasks in a cgroup.
+config MEMCG_NMI_UNSAFE + bool + depends on MEMCG + depends on HAVE_NMI + depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !ARCH_HAVE_NMI_SAFE_CMPXCHG + default y + config MEMCG_V1 bool "Legacy cgroup v1 memory controller" depends on MEMCG diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 35db91fddd1f..2532cc2316ee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2650,6 +2650,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) struct mem_cgroup *memcg; struct obj_cgroup *objcg; + if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi()) + return NULL; + if (in_task()) { memcg = current->active_memcg; if (unlikely(memcg)) -- cgit v1.2.3 From 940b01fc8dc1aead398819215650727cb9e7335e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 18 May 2025 23:31:39 -0700 Subject: memcg: nmi safe memcg stats for specific archs There are archs which have NMI but do not support this_cpu_* ops safely in nmi context; they do, however, support safe atomic ops in nmi context. For such archs, let's add infra to use atomic ops for the memcg stats which can be updated in nmi. At the moment, the memcg stats which get updated in the objcg charging path are MEMCG_KMEM, NR_SLAB_RECLAIMABLE_B & NR_SLAB_UNRECLAIMABLE_B. Rather than adding support for all memcg stats to be nmi safe, let's just add infra to make these three stats nmi safe, which is what this patch does. Link: https://lkml.kernel.org/r/20250519063142.111219-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++++++++ init/Kconfig | 7 +++++++ mm/memcontrol.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f7848f73f41c..87b6688f124a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -113,6 +113,12 @@ struct mem_cgroup_per_node { CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; + +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC + /* slab stats for nmi context */ + atomic_t slab_reclaimable; + atomic_t slab_unreclaimable; +#endif }; struct mem_cgroup_threshold { @@ -236,6 +242,10 @@ struct mem_cgroup { atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC + /* MEMCG_KMEM for nmi context */ + atomic_t kmem_stat; +#endif /* * Hint of reclaim pressure for socket memroy management.
Note * that this indicator should NOT be used in legacy cgroup mode diff --git a/init/Kconfig b/init/Kconfig index a2aa49cfb8bd..e9f47baa34e0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1013,6 +1013,13 @@ config MEMCG_NMI_UNSAFE depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !ARCH_HAVE_NMI_SAFE_CMPXCHG default y +config MEMCG_NMI_SAFETY_REQUIRES_ATOMIC + bool + depends on MEMCG + depends on HAVE_NMI + depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && ARCH_HAVE_NMI_SAFE_CMPXCHG + default y + config MEMCG_V1 bool "Legacy cgroup v1 memory controller" depends on MEMCG diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2532cc2316ee..e8e8becbe926 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3966,6 +3966,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac) } } +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, + int cpu) +{ + int nid; + + if (atomic_read(&memcg->kmem_stat)) { + int kmem = atomic_xchg(&memcg->kmem_stat, 0); + int index = memcg_stats_index(MEMCG_KMEM); + + memcg->vmstats->state[index] += kmem; + if (parent) + parent->vmstats->state_pending[index] += kmem; + } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct lruvec_stats *lstats = pn->lruvec_stats; + struct lruvec_stats *plstats = NULL; + + if (parent) + plstats = parent->nodeinfo[nid]->lruvec_stats; + + if (atomic_read(&pn->slab_reclaimable)) { + int slab = atomic_xchg(&pn->slab_reclaimable, 0); + int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B); + + lstats->state[index] += slab; + if (plstats) + plstats->state_pending[index] += slab; + } + if (atomic_read(&pn->slab_unreclaimable)) { + int slab = atomic_xchg(&pn->slab_unreclaimable, 0); + int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B); + + lstats->state[index] += slab; + if (plstats) + plstats->state_pending[index] += slab; + } + } +} +#else +static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, + int cpu) +{} +#endif + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -3974,6 +4021,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct aggregate_control ac; int nid; + flush_nmi_stats(memcg, parent, cpu); + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); ac = (struct aggregate_control) { -- cgit v1.2.3 From 9d3edf96cef6d65ca2be287e48d40aec90081f3f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 18 May 2025 23:31:40 -0700 Subject: memcg: add nmi-safe update for MEMCG_KMEM The objcg based kmem charging and uncharging code path needs to update MEMCG_KMEM appropriately. Let's add support to update MEMCG_KMEM in nmi-safe way for those code paths. 
Link: https://lkml.kernel.org/r/20250519063142.111219-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/memcontrol.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e8e8becbe926..74bb19fcafad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2715,6 +2715,23 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) return objcg; } +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) +{ + if (likely(!in_nmi())) { + mod_memcg_state(memcg, MEMCG_KMEM, val); + } else { + /* TODO: add to cgroup update tree once it is nmi-safe. */ + atomic_add(val, &memcg->kmem_stat); + } +} +#else +static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) +{ + mod_memcg_state(memcg, MEMCG_KMEM, val); +} +#endif + /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg * @objcg: object cgroup to uncharge @@ -2727,7 +2744,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = get_mem_cgroup_from_objcg(objcg); - mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + account_kmem_nmi_safe(memcg, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); if (!mem_cgroup_is_root(memcg)) refill_stock(memcg, nr_pages); @@ -2755,7 +2772,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + account_kmem_nmi_safe(memcg, nr_pages); memcg1_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); -- cgit v1.2.3 From 15ca4fa9045768dd798f101a09f0c27a2d1b0cdf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 18 May 2025 23:31:41 -0700 Subject: memcg: nmi-safe slab stats updates The objcg based kmem [un]charging can be called in nmi context and it may need to update NR_SLAB_[UN]RECLAIMABLE_B stats. So, let's correctly handle the updates of these stats in the nmi context. Link: https://lkml.kernel.org/r/20250519063142.111219-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/memcontrol.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 74bb19fcafad..74de4e523094 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2515,17 +2515,47 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct lruvec *lruvec; + + if (likely(!in_nmi())) { + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); + } else { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; + + /* TODO: add to cgroup update tree once it is nmi-safe. 
*/ + if (idx == NR_SLAB_RECLAIMABLE_B) + atomic_add(nr, &pn->slab_reclaimable); + else + atomic_add(nr, &pn->slab_unreclaimable); + } +} +#else +static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); +} +#endif + static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - mod_memcg_lruvec_state(lruvec, idx, nr); + account_slab_nmi_safe(memcg, pgdat, idx, nr); rcu_read_unlock(); } -- cgit v1.2.3 From 3ac4638a734abbeb1d4a02b042e26a80a4205975 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 18 May 2025 23:31:42 -0700 Subject: memcg: make memcg_rstat_updated nmi safe Currently kernel maintains memory related stats updates per-cgroup to optimize stats flushing. The stats_updates is defined as atomic64_t which is not nmi-safe on some archs. Actually we don't really need 64bit atomic as the max value stats_updates can get should be less than nr_cpus * MEMCG_CHARGE_BATCH. A normal atomic_t should suffice. Also the function cgroup_rstat_updated() is still not nmi-safe but there is parallel effort to make it nmi-safe, so until then let's ignore it in the nmi context. Link: https://lkml.kernel.org/r/20250519063142.111219-6-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/memcontrol.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 74de4e523094..7e64dbf578d7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -531,7 +531,7 @@ struct memcg_vmstats { unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ - atomic64_t stats_updates; + atomic_t stats_updates; }; /* @@ -557,7 +557,7 @@ static u64 flush_last_time; static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic64_read(&vmstats->stats_updates) > + return atomic_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } @@ -571,7 +571,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, if (!val) return; - cgroup_rstat_updated(memcg->css.cgroup, cpu); + /* TODO: add to cgroup update tree once it is nmi-safe. 
*/ + if (!in_nmi()) + cgroup_rstat_updated(memcg->css.cgroup, cpu); statc_pcpu = memcg->vmstats_percpu; for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { statc = this_cpu_ptr(statc_pcpu); @@ -589,7 +591,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, continue; stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); - atomic64_add(stats_updates, &statc->vmstats->stats_updates); + atomic_add(stats_updates, &statc->vmstats->stats_updates); } } @@ -597,7 +599,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); - trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates), + trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates), force, needs_flush); if (!force && !needs_flush) @@ -4119,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ - if (atomic64_read(&memcg->vmstats->stats_updates)) - atomic64_set(&memcg->vmstats->stats_updates, 0); + if (atomic_read(&memcg->vmstats->stats_updates)) + atomic_set(&memcg->vmstats->stats_updates, 0); } static void mem_cgroup_fork(struct task_struct *task) -- cgit v1.2.3 From 8e1c4961f44be6172553c062d8f425a4a4357afa Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 17 May 2025 23:18:52 +0900 Subject: mm/damon/core: avoid destroyed target reference from DAMOS quota When the number of the monitoring targets in running contexts is reduced, there may be DAMOS quotas referencing the targets that will be destroyed. Applying the scheme action for such DAMOS scheme will be skipped forever looking for the starting part of the region for the destroyed monitoring target. To fix this issue, when the monitoring target is destroyed, reset the starting part for all DAMOS quotas that reference the target. Link: https://lkml.kernel.org/r/20250517141852.142802-1-akinobu.mita@gmail.com Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: Akinobu Mita Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 0bb71e2ab713..b217e0120e09 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1093,9 +1093,17 @@ static int damon_commit_targets( if (err) return err; } else { + struct damos *s; + if (damon_target_has_pid(dst)) put_pid(dst_target->pid); damon_destroy_target(dst_target); + damon_for_each_scheme(s, dst) { + if (s->quota.charge_target_from == dst_target) { + s->quota.charge_target_from = NULL; + s->quota.charge_addr_from = 0; + } + } } } -- cgit v1.2.3 From e08d5f515613a9860bfee7312461a19f422adb5e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:35 +0800 Subject: mm: shmem: avoid unpaired folio_unlock() in shmem_swapin_folio() Patch series "Some random fixes and cleanup to shmem", v3. This series contains some simple fixes and cleanup which are made during learning shmem. More details can be found in respective patches. This patch (of 5): If we get a folio from swap_cache_get_folio() successfully but encounter a failure before the folio is locked, we will unlock the folio which was not previously locked. Put the folio and set it to NULL when a failure occurs before the folio is locked to fix the issue. 
Link: https://lkml.kernel.org/r/20250516170939.965736-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250516170939.965736-2-shikemeng@huaweicloud.com Fixes: 058313515d5a ("mm: shmem: fix potential data corruption during shmem swapin") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Reviewed-by: Kairui Song Cc: Hugh Dickins Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 99327c30507c..980fa15f393e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2335,6 +2335,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, */ split_order = shmem_split_large_entry(inode, index, swap, gfp); if (split_order < 0) { + folio_put(folio); + folio = NULL; error = split_order; goto failed; } -- cgit v1.2.3 From 594ec2ab389ab9cdada13ad85f503bbfc5658e84 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:36 +0800 Subject: mm: shmem: add missing shmem_unacct_size() in __shmem_file_setup() We will miss shmem_unacct_size() when the is_idmapped_mnt() check rejects the mount. Move the is_idmapped_mnt() check before shmem_acct_size() to fix the issue. Link: https://lkml.kernel.org/r/20250516170939.965736-3-shikemeng@huaweicloud.com Fixes: 7a80e5b8c6fa ("shmem: support idmapped mounts for tmpfs") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/shmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 980fa15f393e..495e661eb8bb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5812,12 +5812,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) - return ERR_PTR(-ENOMEM); - if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { -- cgit v1.2.3 From 3f778ab1b52450d467109321e2abfc4b36ec2d3e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:37 +0800 Subject: mm/shmem: fix potential dead loop in shmem_unuse() If multiple shmem_unuse() calls for different swap types run concurrently, a dead loop could occur as follows: shmem_unuse(typeA) shmem_unuse(typeB) mutex_lock(&shmem_swaplist_mutex) list_for_each_entry_safe(info, next, ...) ... mutex_unlock(&shmem_swaplist_mutex) /* info->swapped may drop to 0 */ shmem_unuse_inode(&info->vfs_inode, type) mutex_lock(&shmem_swaplist_mutex) list_for_each_entry(info, next, ...) if (!info->swapped) list_del_init(&info->swaplist) ... mutex_unlock(&shmem_swaplist_mutex) mutex_lock(&shmem_swaplist_mutex) /* iterate with offlist entry and encounter a dead loop */ next = list_next_entry(info, swaplist); ... Restart the iteration if the inode is already off the shmem_swaplist to fix the issue.
Link: https://lkml.kernel.org/r/20250516170939.965736-4-shikemeng@huaweicloud.com Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 495e661eb8bb..aeeddf612baa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1505,6 +1505,7 @@ int shmem_unuse(unsigned int type) return 0; mutex_lock(&shmem_swaplist_mutex); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1523,13 +1524,15 @@ int shmem_unuse(unsigned int type) cond_resched(); mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } mutex_unlock(&shmem_swaplist_mutex); -- cgit v1.2.3 From a5cdbe9f376f920af102fd0f0ecfd9952bb6bc40 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:38 +0800 Subject: mm: shmem: only remove inode from swaplist when its swapped page count is 0 Even if we fail to allocate a swap entry, the inode might have a previously allocated entry, and we might take the inode containing that swap entry off the swaplist. As a result, try_to_unuse() may enter a potential dead loop, repeatedly looking for the inode to clean its swap entry. Only take the inode off the swaplist when its swapped page count is 0 to fix the issue. Link: https://lkml.kernel.org/r/20250516170939.965736-5-shikemeng@huaweicloud.com Fixes: b487a2da3575 ("mm, swap: simplify folio swap allocation") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Reviewed-by: Kairui Song Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202505161438.9009cf47-lkp@intel.com Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index aeeddf612baa..07b8e1400c67 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1651,8 +1651,8 @@ try_split: BUG_ON(folio_mapped(folio)); return swap_writepage(&folio->page, wbc); } - - list_del_init(&info->swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); if (nr_pages > 1) goto try_split; -- cgit v1.2.3 From c5a9deace6095c0dca86f5414493bcf1711f1ca3 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:39 +0800 Subject: mm/shmem: remove unneeded xa_is_value() check in shmem_unuse_swap_entries() As only value entries will be added to fbatch in shmem_find_swap_entries(), there is no need to do the xa_is_value() check in shmem_unuse_swap_entries().
Link: https://lkml.kernel.org/r/20250516170939.965736-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 07b8e1400c67..4b42419ce6b2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1446,8 +1446,6 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { -- cgit v1.2.3 From 3aefb1f06994ebbd94b7b2e5bf9cc3fecbd3eb39 Mon Sep 17 00:00:00 2001 From: Wenjie Xu Date: Thu, 15 May 2025 19:42:31 +0800 Subject: hugetlb: show nr_huge_pages in report_hugepages() The number of pre-allocated huge pages should be nr_huge_pages, not free_huge_pages, although they are same during booting stage Link: https://lkml.kernel.org/r/20250515114231.65824-1-xuwenjie04@baidu.com Signed-off-by: Wenjie Xu Signed-off-by: Li RongQing Acked-by: Oscar Salvador Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d53caf96a4b2..5a0bf1ea48f9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3733,7 +3733,7 @@ static void __init report_hugepages(void) string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", - buf, h->free_huge_pages); + buf, h->nr_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", buf, nrinvalid, nrinvalid > 1 ? "s" : ""); -- cgit v1.2.3 From 0acfc656df50403cbd9dcb2d7b2a04fce161e6e4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 20 May 2025 21:27:54 -0700 Subject: mm/damon/Kconfig: set DAMON_{VADDR,PADDR,SYSFS} default to DAMON Patch series "mm/damon: build-enable essential DAMON components by default". As of this writing, multiple major distros including Alma, Amazon, Android, CentOS, Debian, Fedora, and Oracle are build-enabling DAMON (set CONFIG_DAMON[1]). Configuring DAMON is not very easy, since it is disabled by default, and there are multiple essential options that need to be manually turned on, one by one. Make it easier, by grouping essential configurations to be enabled with one selection, and enabling build of the essential parts of DAMON by default. Note that build-enabling DAMON does not introduce any real risk, since it makes no behavioral change by default. It requires explicit user requests to do anything. Only one potential risk is making the size of the kernel a little bit larger. On a production-purpose configuration, it increases the resulting kernel package binary size by about 0.1 % of the final package file. I believe that's too small to be a real problem in common setups. DAMON_{VADDR,PADDR,SYSFS} are de-facto essential parts of DAMON for normal usages. Because those need to be enabled one by one, however, and there are other test-purpose or non-essential configurations, it is easy to be confused and make mistakes at setup. Make the essential configurations default to CONFIG_DAMON, so that those can be enabled by default with a single change. 
Link: https://oracle.github.io/kconfigs/?config=UTS_RELEASE&config=DAMON [1] Link: https://lkml.kernel.org/r/20250521042755.39653-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250521042755.39653-2-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: Honggyu Kim Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index c213cf8b5638..c93d0c56b963 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -28,6 +28,7 @@ config DAMON_VADDR bool "Data access monitoring operations for virtual address spaces" depends on DAMON && MMU select PAGE_IDLE_FLAG + default DAMON help This builds the default data access monitoring operations for DAMON that work for virtual address spaces. @@ -36,6 +37,7 @@ config DAMON_PADDR bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU select PAGE_IDLE_FLAG + default DAMON help This builds the default data access monitoring operations for DAMON that works for the physical address space. @@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST config DAMON_SYSFS bool "DAMON sysfs interface" depends on DAMON && SYSFS + default DAMON help This builds the sysfs interface for DAMON. The user space can use the interface for arbitrary data access monitoring. -- cgit v1.2.3 From 28615e6eed152f2fda5486680090b74aeed7b554 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 20 May 2025 21:27:55 -0700 Subject: mm/damon/Kconfig: enable CONFIG_DAMON by default As of this writing, multiple major distros including Alma, Amazon, Android, CentOS, Debian, Fedora, and Oracle are build-enabling DAMON (set CONFIG_DAMON[1]). Enabling it by default will save configuration setup time for the current and future DAMON users. Build-enabling DAMON does not introduce a real risk since it makes no behavioral change by default. It requires explicit user requests to do anything. Only one potential risk is making the size of the kernel a little bit larger. On a production-purpose configuration, it increases the resulting kernel package size by about 0.1 % of the final package file. I believe that's too small to be a real problem in common setups. Hence, the benefit of enabling CONFIG_DAMON outweighs the potential risk. Set CONFIG_DAMON by default. Link: https://oracle.github.io/kconfigs/?config=UTS_RELEASE&config=DAMON [1] Link: https://lkml.kernel.org/r/20250521042755.39653-3-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: Honggyu Kim Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index c93d0c56b963..551745df011b 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -4,6 +4,7 @@ menu "Data Access Monitoring" config DAMON bool "DAMON: Data Access Monitoring Framework" + default y help This builds a framework that allows kernel subsystems to monitor access frequency of each memory region. The information can be useful -- cgit v1.2.3 From bfe125f1b1870c7b5f05b489a525042d6715fcc1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 22 May 2025 01:28:38 +0000 Subject: mmu_gather: move tlb flush for VM_PFNMAP/VM_MIXEDMAP vmas into free_pgtables() Commit b67fbebd4cf9 ("mmu_gather: Force tlb-flush VM_PFNMAP vmas") added a forced tlbflush to tlb_vma_end(), which is required to avoid a race between munmap() and unmap_mapping_range(). However it added some overhead to other paths where tlb_vma_end() is used, but vmas are not removed, e.g. 
madvise(MADV_DONTNEED). Fix this by moving the tlb flush out of tlb_end_vma() into new tlb_flush_vmas() called from free_pgtables(), somewhat similar to the stable version of the original commit: commit 895428ee124a ("mm: Force TLB flush for PFNMAP mappings before unlink_file_vma()"). Note, that if tlb->fullmm is set, no flush is required, as the whole mm is about to be destroyed. Link: https://lkml.kernel.org/r/20250522012838.163876-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: Jann Horn Acked-by: Hugh Dickins Acked-by: Peter Zijlstra (Intel) Cc: Will Deacon Cc: "Aneesh Kumar K.V" Cc: Nick Piggin Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 46 ++++++++++++++++++++++++++++++++++++---------- mm/memory.c | 2 ++ mm/mmu_gather.c | 1 + 3 files changed, 39 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 88a42973fa47..1fff717cae51 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -58,6 +58,11 @@ * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * + * - tlb_free_vmas() + * + * tlb_free_vmas() marks the start of unlinking of one or more vmas + * and freeing page-tables. + * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories @@ -464,7 +469,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); - tlb->vma_pfn = !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); + + /* + * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma + * in the tracked range, see tlb_free_vmas(). + */ + tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) @@ -547,23 +557,39 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct * } static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) + return; + + /* + * Do a TLB flush and reset the range at VMA boundaries; this avoids + * the ranges growing with the unused space between consecutive VMAs, + * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on + * this. + */ + tlb_flush_mmu_tlbonly(tlb); +} + +static inline void tlb_free_vmas(struct mmu_gather *tlb) { if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the - * page mapcount -- there might not be page-frames for these PFNs after - * all. Force flush TLBs for such ranges to avoid munmap() vs - * unmap_mapping_range() races. + * page mapcount -- there might not be page-frames for these PFNs + * after all. + * + * Specifically() there is a race between munmap() and + * unmap_mapping_range(), where munmap() will unlink the VMA, such + * that unmap_mapping_range() will no longer observe the VMA and + * no-op, without observing the TLBI, returning prematurely. + * + * So if we're about to unlink such a VMA, and we have pending + * TLBI for such a vma, flush things now. */ - if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) { - /* - * Do a TLB flush and reset the range at VMA boundaries; this avoids - * the ranges growing with the unused space between consecutive VMAs. 
- */ + if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); - } } /* diff --git a/mm/memory.c b/mm/memory.c index 37d8738f5e12..8eba595056fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -358,6 +358,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, { struct unlink_vma_file_batch vb; + tlb_free_vmas(tlb); + do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index db7ba4a725d6..b49cc6385f1f 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -424,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif + tlb->vma_pfn = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); -- cgit v1.2.3 From e13e7922d03439e374c263049af5f740ceae6346 Mon Sep 17 00:00:00 2001 From: Juan Yescas Date: Wed, 21 May 2025 14:57:45 -0700 Subject: mm: add CONFIG_PAGE_BLOCK_ORDER to select page block order Problem: On large page size configurations (16KiB, 64KiB), the CMA alignment requirement (CMA_MIN_ALIGNMENT_BYTES) increases considerably, and this causes the CMA reservations to be larger than necessary. This means that system will have less available MIGRATE_UNMOVABLE and MIGRATE_RECLAIMABLE page blocks since MIGRATE_CMA can't fallback to them. The CMA_MIN_ALIGNMENT_BYTES increases because it depends on MAX_PAGE_ORDER which depends on ARCH_FORCE_MAX_ORDER. The value of ARCH_FORCE_MAX_ORDER increases on 16k and 64k kernels. For example, in ARM, the CMA alignment requirement when: - CONFIG_ARCH_FORCE_MAX_ORDER default value is used - CONFIG_TRANSPARENT_HUGEPAGE is set: PAGE_SIZE | MAX_PAGE_ORDER | pageblock_order | CMA_MIN_ALIGNMENT_BYTES ----------------------------------------------------------------------- 4KiB | 10 | 9 | 4KiB * (2 ^ 9) = 2MiB 16Kib | 11 | 11 | 16KiB * (2 ^ 11) = 32MiB 64KiB | 13 | 13 | 64KiB * (2 ^ 13) = 512MiB There are some extreme cases for the CMA alignment requirement when: - CONFIG_ARCH_FORCE_MAX_ORDER maximum value is set - CONFIG_TRANSPARENT_HUGEPAGE is NOT set: - CONFIG_HUGETLB_PAGE is NOT set PAGE_SIZE | MAX_PAGE_ORDER | pageblock_order | CMA_MIN_ALIGNMENT_BYTES ------------------------------------------------------------------------ 4KiB | 15 | 15 | 4KiB * (2 ^ 15) = 128MiB 16Kib | 13 | 13 | 16KiB * (2 ^ 13) = 128MiB 64KiB | 13 | 13 | 64KiB * (2 ^ 13) = 512MiB This affects the CMA reservations for the drivers. If a driver in a 4KiB kernel needs 4MiB of CMA memory, in a 16KiB kernel, the minimal reservation has to be 32MiB due to the alignment requirements: reserved-memory { ... cma_test_reserve: cma_test_reserve { compatible = "shared-dma-pool"; size = <0x0 0x400000>; /* 4 MiB */ ... }; }; reserved-memory { ... cma_test_reserve: cma_test_reserve { compatible = "shared-dma-pool"; size = <0x0 0x2000000>; /* 32 MiB */ ... }; }; Solution: Add a new config CONFIG_PAGE_BLOCK_ORDER that allows to set the page block order in all the architectures. The maximum page block order will be given by ARCH_FORCE_MAX_ORDER. By default, CONFIG_PAGE_BLOCK_ORDER will have the same value that ARCH_FORCE_MAX_ORDER. This will make sure that current kernel configurations won't be affected by this change. It is a opt-in change. This patch will allow to have the same CMA alignment requirements for large page sizes (16KiB, 64KiB) as that in 4kb kernels by setting a lower pageblock_order. Tests: - Verified that HugeTLB pages work when pageblock_order is 1, 7, 10 on 4k and 16k kernels. 
- Verified that Transparent Huge Pages work when pageblock_order is 1, 7, 10 on 4k and 16k kernels. - Verified that dma-buf heaps allocations work when pageblock_order is 1, 7, 10 on 4k and 16k kernels. Benchmarks: The benchmarks compare 16kb kernels with pageblock_order 10 and 7. The reason for the pageblock_order 7 is because this value makes the min CMA alignment requirement the same as that in 4kb kernels (2MB). - Perform 100K dma-buf heaps (/dev/dma_heap/system) allocations of SZ_8M, SZ_4M, SZ_2M, SZ_1M, SZ_64, SZ_8, SZ_4. Use simpleperf (https://developer.android.com/ndk/guides/simpleperf) to measure the # of instructions and page-faults on 16k kernels. The benchmark was executed 10 times. The averages are below: # instructions | #page-faults order 10 | order 7 | order 10 | order 7 -------------------------------------------------------- 13,891,765,770 | 11,425,777,314 | 220 | 217 14,456,293,487 | 12,660,819,302 | 224 | 219 13,924,261,018 | 13,243,970,736 | 217 | 221 13,910,886,504 | 13,845,519,630 | 217 | 221 14,388,071,190 | 13,498,583,098 | 223 | 224 13,656,442,167 | 12,915,831,681 | 216 | 218 13,300,268,343 | 12,930,484,776 | 222 | 218 13,625,470,223 | 14,234,092,777 | 219 | 218 13,508,964,965 | 13,432,689,094 | 225 | 219 13,368,950,667 | 13,683,587,37 | 219 | 225 ------------------------------------------------------------------- 13,803,137,433 | 13,131,974,268 | 220 | 220 Averages There were 4.85% #instructions when order was 7, in comparison with order 10. 13,803,137,433 - 13,131,974,268 = -671,163,166 (-4.86%) The number of page faults in order 7 and 10 were the same. These results didn't show any significant regression when the pageblock_order is set to 7 on 16kb kernels. - Run speedometer 3.1 (https://browserbench.org/Speedometer3.1/) 5 times on the 16k kernels with pageblock_order 7 and 10. order 10 | order 7 | order 7 - order 10 | (order 7 - order 10) % ------------------------------------------------------------------- 15.8 | 16.4 | 0.6 | 3.80% 16.4 | 16.2 | -0.2 | -1.22% 16.6 | 16.3 | -0.3 | -1.81% 16.8 | 16.3 | -0.5 | -2.98% 16.6 | 16.8 | 0.2 | 1.20% ------------------------------------------------------------------- 16.44 16.4 -0.04 -0.24% Averages The results didn't show any significant regression when the pageblock_order is set to 7 on 16kb kernels. Link: https://lkml.kernel.org/r/20250521215807.1860663-1-jyescas@google.com Signed-off-by: Juan Yescas Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: David Hildenbrand Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 16 ++++++++++++++++ include/linux/pageblock-flags.h | 8 ++++---- mm/Kconfig | 34 ++++++++++++++++++++++++++++++++++ mm/mm_init.c | 2 +- 4 files changed, 55 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b19a98c20de8..87a667533d6d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -37,6 +37,22 @@ #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) +/* Defines the order for the number of pages that have a migrate type. 
*/ +#ifndef CONFIG_PAGE_BLOCK_ORDER +#define PAGE_BLOCK_ORDER MAX_PAGE_ORDER +#else +#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER +#endif /* CONFIG_PAGE_BLOCK_ORDER */ + +/* + * The MAX_PAGE_ORDER, which defines the max order of pages to be allocated + * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER, + * which defines the order for the number of pages that can have a migrate type + */ +#if (PAGE_BLOCK_ORDER > MAX_PAGE_ORDER) +#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_ORDER +#endif + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index fc6b9c87cb0a..e73a4292ef02 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,18 +41,18 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #elif defined(CONFIG_TRANSPARENT_HUGEPAGE) -#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER) #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order MAX_PAGE_ORDER +/* If huge pages are not used, group by PAGE_BLOCK_ORDER */ +#define pageblock_order PAGE_BLOCK_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/Kconfig b/mm/Kconfig index bd08e151fa1b..f8bb8f070d0d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -993,6 +993,40 @@ config CMA_AREAS If unsure, leave the default value "8" in UMA and "20" in NUMA. +# +# Select this config option from the architecture Kconfig, if available, to set +# the max page order for physically contiguous allocations. +# +config ARCH_FORCE_MAX_ORDER + int + +# +# When ARCH_FORCE_MAX_ORDER is not defined, +# the default page block order is MAX_PAGE_ORDER (10) as per +# include/linux/mmzone.h. +# +config PAGE_BLOCK_ORDER + int "Page Block Order" + range 1 10 if ARCH_FORCE_MAX_ORDER = 0 + default 10 if ARCH_FORCE_MAX_ORDER = 0 + range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + help + The page block order refers to the power of two number of pages that + are physically contiguous and can have a migrate type associated to + them. The maximum size of the page block order is limited by + ARCH_FORCE_MAX_ORDER. + + This config allows overriding the default page block order when the + page block order is required to be smaller than ARCH_FORCE_MAX_ORDER + or MAX_PAGE_ORDER. + + Reducing pageblock order can negatively impact THP generation + success rate. If your workloads uses THP heavily, please use this + option with caution. + + Don't change if unsure. 
+ config MEM_SOFT_DIRTY bool "Track memory changes" depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS diff --git a/mm/mm_init.c b/mm/mm_init.c index 1c5444e188f8..8684fa851b84 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1509,7 +1509,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_PAGE_ORDER; + unsigned int order = PAGE_BLOCK_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) -- cgit v1.2.3 From 595cf683519ab5a277d258a2251ee8cc7b838d6d Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Mon, 26 May 2025 18:28:18 +0000 Subject: mm/khugepaged: fix race with folio split/free using temporary reference hpage_collapse_scan_file() calls is_refcount_suitable(), which in turn calls folio_mapcount(). folio_mapcount() checks folio_test_large() before proceeding to folio_large_mapcount(), but there is a race window where the folio may get split/freed between these checks, triggering: VM_WARN_ON_FOLIO(!folio_test_large(folio), folio) Take a temporary reference to the folio in hpage_collapse_scan_file(). This stabilizes the folio during refcount check and prevents incorrect large folio detection due to concurrent split/free. Use helper folio_expected_ref_count() + 1 to compare with folio_ref_count() instead of using is_refcount_suitable(). Link: https://lkml.kernel.org/r/20250526182818.37978-1-shivankg@amd.com Fixes: 05c5323b2a34 ("mm: track mapcount of large folios in single value") Signed-off-by: Shivank Garg Reported-by: syzbot+2b99589e33edbe9475ca@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6828470d.a70a0220.38f255.000c.GAE@google.com Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Dev Jain Reviewed-by: Baolin Wang Cc: Bharata B Rao Cc: Fengwei Yin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Ryan Roberts Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/khugepaged.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cdf5a581368b..7731a162a1a7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2293,6 +2293,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, continue; } + if (!folio_try_get(folio)) { + xas_reset(&xas); + continue; + } + + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + xas_reset(&xas); + continue; + } + if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ @@ -2303,23 +2314,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, * it's safe to skip LRU and refcount checks before * returning. 
*/ + folio_put(folio); break; } node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; + folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; + folio_put(folio); break; } - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; + folio_put(folio); break; } @@ -2331,6 +2346,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, */ present += folio_nr_pages(folio); + folio_put(folio); if (need_resched()) { xas_pause(&xas); -- cgit v1.2.3 From 52084f258e46f09a71063447df31cbd48c0cacd0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 28 May 2025 23:06:17 +0200 Subject: mm/gup: update comment explaining why gup_fast() disables IRQs The current comment in gup_fast() talks about "IPIs that come from THPs splitting", which is outdated and refers to the old THP splitting implementation that was removed in commit ad0bed24e98b ("thp: drop all split_huge_page()-related code"), which landed in v4.5. Before then, THP splitting involved a pmdp_splitting_flush(), which sent an IPI to serialize against gup_fast(). Nowadays, we use tlb_remove_table_sync_one() to send IPIs that serialize against gup_fast(); this is used, for example, in THP *collapsing* to stop gup_fast() walks of a page table before depositing it. Link: https://lkml.kernel.org/r/20250528-gup-irq-comment-fix-v1-1-b9d83c345333@google.com Signed-off-by: Jann Horn Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shuemov Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 329c5f7acc7a..e065a49842a8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3299,7 +3299,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock() here as we also want to block IPIs - * that come from THPs splitting. + * that come from callers of tlb_remove_table_sync_one(). */ local_irq_save(flags); gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned); -- cgit v1.2.3 From ad6b26b6a0a79166b53209df2ca1cf8636296382 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 23 May 2025 20:51:15 +0800 Subject: sched/numa: add statistics of numa balance task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On systems with NUMA balancing enabled, it has been found that tracking task activities resulting from NUMA balancing is beneficial. NUMA balancing employs two mechanisms for task migration: one is to migrate a task to an idle CPU within its preferred node, and the other is to swap tasks located on different nodes when they are on each other's preferred nodes. The kernel already provides NUMA page migration statistics in /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched. However, it lacks statistics regarding task migration and swapping. Therefore, relevant counts for task migration and swapping should be added. The following two new fields: numa_task_migrated numa_task_swapped will be shown in /sys/fs/cgroup/{GROUP}/memory.stat, /proc/{PID}/sched and /proc/vmstat. Introducing both per-task and per-memory cgroup (memcg) NUMA balancing statistics facilitates a rapid evaluation of the performance and resource utilization of the target workload. 
For instance, users can first identify the container with high NUMA balancing activity and then further pinpoint a specific task within that group, and subsequently adjust the memory policy for that task. In short, although it is possible to iterate through /proc/$pid/sched to locate the problematic task, the introduction of aggregated NUMA balancing activity for tasks within each memcg can assist users in identifying the task more efficiently through a divide-and-conquer approach. As Libo Chen pointed out, the memcg event relies on the text names in vmstat_text, and /proc/vmstat generates corresponding items based on vmstat_text. Thus, the relevant task migration and swapping events introduced in vmstat_text also need to be populated by count_vm_numa_event(), otherwise these values are zero in /proc/vmstat. In theory, task migration and swap events are part of the scheduler's activities. The reason for exposing them through the memory.stat/vmstat interface is that we already have NUMA balancing statistics in memory.stat/vmstat, and these events are closely related to each other. Following Shakeel's suggestion, we describe the end-to-end flow/story of all these events occurring on a timeline for future reference: The goal of NUMA balancing is to co-locate a task and its memory pages on the same NUMA node. There are two strategies: migrate the pages to the task's node, or migrate the task to the node where its pages reside. Suppose a task p1 is running on Node 0, but its pages are located on Node 1. NUMA page fault statistics for p1 reveal its "page footprint" across nodes. If NUMA balancing detects that most of p1's pages are on Node 1: 1.Page Migration Attempt: The Numa balance first tries to migrate p1's pages to Node 0. The numa_page_migrate counter increments. 2.Task Migration Strategies: After the page migration finishes, Numa balance checks every 1 second to see if p1 can be migrated to Node 1. Case 2.1: Idle CPU Available If Node 1 has an idle CPU, p1 is directly scheduled there. This event is logged as numa_task_migrated. Case 2.2: No Idle CPU (Task Swap) If all CPUs on Node1 are busy, direct migration could cause CPU contention or load imbalance. Instead: The Numa balance selects a candidate task p2 on Node 1 that prefers Node 0 (e.g., due to its own page footprint). p1 and p2 are swapped. This cross-node swap is recorded as numa_task_swapped. 
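[ Editorial illustration, not part of the original commit message: a minimal user-space sketch for reading the two new counters once this patch is applied. The counter names come from the vmstat_text additions below; the only other assumption is /proc/vmstat's standard "name value" per-line format.

	#include <stdio.h>
	#include <string.h>

	/* Print the NUMA balancing task migration/swap counters from /proc/vmstat. */
	int main(void)
	{
		char name[64];
		unsigned long long val;
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f) {
			perror("fopen /proc/vmstat");
			return 1;
		}

		while (fscanf(f, "%63s %llu", name, &val) == 2) {
			if (!strcmp(name, "numa_task_migrated") ||
			    !strcmp(name, "numa_task_swapped"))
				printf("%s %llu\n", name, val);
		}

		fclose(f);
		return 0;
	}

The same two keys appear as nested keys in /sys/fs/cgroup/{GROUP}/memory.stat (see the cgroup-v2.rst hunk below) for a per-memcg breakdown, and the per-task values are visible in /proc/{PID}/sched. ]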
Link: https://lkml.kernel.org/r/d00edb12ba0f0de3c5222f61487e65f2ac58f5b1.1748493462.git.yu.c.chen@intel.com Link: https://lkml.kernel.org/r/7ef90a88602ed536be46eba7152ed0d33bad5790.1748002400.git.yu.c.chen@intel.com Signed-off-by: Chen Yu Tested-by: K Prateek Nayak Tested-by: Madadi Vineeth Reddy Acked-by: Peter Zijlstra (Intel) Tested-by: Venkat Rao Bagalkote Cc: Aubrey Li Cc: Ayush Jain Cc: "Chen, Tim C" Cc: Ingo Molnar Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Libo Chen Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ include/linux/sched.h | 4 ++++ include/linux/vm_event_item.h | 2 ++ kernel/sched/core.c | 9 +++++++-- kernel/sched/debug.c | 4 ++++ mm/memcontrol.c | 2 ++ mm/vmstat.c | 2 ++ 7 files changed, 27 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index acf855851c03..cb279c69925e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1697,6 +1697,12 @@ The following nested keys are defined. numa_hint_faults (npn) Number of NUMA hinting faults. + numa_task_migrated (npn) + Number of task migration by NUMA balancing. + + numa_task_swapped (npn) + Number of task swap by NUMA balancing. + pgdemote_kswapd Number of pages demoted by kswapd. diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..1c50e30b5c01 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,10 @@ struct sched_statistics { u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; +#ifdef CONFIG_NUMA_BALANCING + u64 numa_task_migrated; + u64 numa_task_swapped; +#endif u64 nr_wakeups; u64 nr_wakeups_sync; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9e15a088ba38..91a3ce9a2687 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NUMA_HINT_FAULTS, NUMA_HINT_FAULTS_LOCAL, NUMA_PAGE_MIGRATE, + NUMA_TASK_MIGRATE, + NUMA_TASK_SWAP, #endif #ifdef CONFIG_MIGRATION PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c81cf642dba0..62b033199e9c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3352,6 +3352,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { + __schedstat_inc(p->stats.numa_task_swapped); + count_vm_numa_event(NUMA_TASK_SWAP); + count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -7953,8 +7957,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - /* TODO: This is not properly updating schedstats */ - + __schedstat_inc(p->stats.numa_task_migrated); + count_vm_numa_event(NUMA_TASK_MIGRATE); + count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 56ae54e0ce6a..f971c2af7912 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, 
P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); +#ifdef CONFIG_NUMA_BALANCING + P_SCHEDSTAT(numa_task_migrated); + P_SCHEDSTAT(numa_task_swapped); +#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e64dbf578d7..4e9771e6e340 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -474,6 +474,8 @@ static const unsigned int memcg_vm_event_stat[] = { NUMA_PAGE_MIGRATE, NUMA_PTE_UPDATES, NUMA_HINT_FAULTS, + NUMA_TASK_MIGRATE, + NUMA_TASK_SWAP, #endif }; diff --git a/mm/vmstat.c b/mm/vmstat.c index d888c248d99f..6f740f070b3d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = { "numa_hint_faults", "numa_hint_faults_local", "numa_pages_migrated", + "numa_task_migrated", + "numa_task_swapped", #endif #ifdef CONFIG_MIGRATION "pgmigrate_success", -- cgit v1.2.3 From 0b43b8bc8ef88bb45b018b2d4853d38bfc5ce2a7 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Mon, 26 May 2025 18:28:20 +0000 Subject: mm/khugepaged: clean up refcount check using folio_expected_ref_count() Use folio_expected_ref_count() instead of open-coded logic in is_refcount_suitable(). This avoids code duplication and improves clarity. Drop is_refcount_suitable() as it is no longer needed. Link: https://lkml.kernel.org/r/20250526182818.37978-2-shivankg@amd.com Signed-off-by: Shivank Garg Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Dev Jain Reviewed-by: Baolin Wang Cc: Bharata B Rao Cc: Fengwei Yin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7731a162a1a7..15203ea7d007 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static bool is_refcount_suitable(struct folio *folio) -{ - int expected_refcount = folio_mapcount(folio); - - if (!folio_test_anon(folio) || folio_test_swapcache(folio)) - expected_refcount += folio_nr_pages(folio); - - if (folio_test_private(folio)) - expected_refcount++; - - return folio_ref_count(folio) == expected_refcount; -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, @@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; @@ -1402,7 +1389,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } -- cgit v1.2.3