summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--mm/huge_memory.c35
-rw-r--r--mm/internal.h10
-rw-r--r--mm/memcontrol.c32
-rw-r--r--mm/page_alloc.c2
4 files changed, 61 insertions, 18 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8d5a6141b951..635f0f0f6860 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2767,18 +2767,38 @@ out:
return ret;
}
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
+
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
void deferred_split_folio(struct folio *folio)
@@ -2797,14 +2817,11 @@ void deferred_split_folio(struct folio *folio)
return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
diff --git a/mm/internal.h b/mm/internal.h
index 78db278c126d..b30907537801 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -413,11 +413,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
- return;
+ return false;
/*
* At this point, there is no one trying to add the folio to
@@ -425,9 +425,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio)
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
- return;
+ return false;
- __folio_undo_large_rmappable(folio);
+ return __folio_unqueue_deferred_split(folio);
}
static inline struct folio *page_rmappable_folio(struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a7a6a1a23c7d..d2ceadd11b10 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5873,6 +5873,8 @@ static int mem_cgroup_move_account(struct page *page,
css_get(&to->css);
css_put(&from->css);
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@@ -6237,7 +6239,10 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct page *page;
+ struct folio *folio;
+ bool tried_split_before = false;
+retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -6247,6 +6252,28 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
page = target.page;
+ folio = page_folio(page);
+ /*
+ * Deferred split queue locking depends on memcg,
+ * and unqueue is unsafe unless folio refcount is 0:
+ * split or skip if on the queue? first try to split.
+ */
+ if (!list_empty(&folio->_deferred_list)) {
+ spin_unlock(ptl);
+ if (!tried_split_before)
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (tried_split_before)
+ return 0;
+ tried_split_before = true;
+ goto retry_pmd;
+ }
+ /*
+ * So long as that pmd lock is held, the folio cannot
+ * be racily added to the _deferred_list, because
+ * page_remove_rmap() will find it still pmdmapped.
+ */
if (isolate_lru_page(page)) {
if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
@@ -7153,9 +7180,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
- !folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
/*
* Nobody should be changing or seriously looking at
@@ -7202,6 +7226,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@@ -7495,6 +7520,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd9d9afbe1da..7272a922b838 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -600,7 +600,7 @@ void destroy_large_folio(struct folio *folio)
return;
}
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_the_page(&folio->page, folio_order(folio));
}