summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorAlistair Popple <apopple@nvidia.com>2025-02-28 14:31:06 +1100
committerAndrew Morton <akpm@linux-foundation.org>2025-03-17 22:06:39 -0700
commit82ba975e4c43d98afebced82d940ddb7aec42a9d (patch)
treeaf8448e3338f4a6f3eab22f6c8c926deb6b4a498 /mm
parentb7e2823787735ca009e63f35f164b46df0ef096c (diff)
mm: allow compound zone device pages
Zone device pages are used to represent various type of device memory managed by device drivers. Currently compound zone device pages are not supported. This is because MEMORY_DEVICE_FS_DAX pages are the only user of higher order zone device pages and have their own page reference counting. A future change will unify FS DAX reference counting with normal page reference counting rules and remove the special FS DAX reference counting. Supporting that requires compound zone device pages. Supporting compound zone device pages requires compound_head() to distinguish between head and tail pages whilst still preserving the special struct page fields that are specific to zone device pages. A tail page is distinguished by having bit zero being set in page->compound_head, with the remaining bits pointing to the head page. For zone device pages page->compound_head is shared with page->pgmap. The page->pgmap field must be common to all pages within a folio, even if the folio spans memory sections. Therefore pgmap is the same for both head and tail pages and can be moved into the folio and we can use the standard scheme to find compound_head from a tail page. Link: https://lkml.kernel.org/r/67055d772e6102accf85161d0b57b0b3944292bf.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Signed-off-by: Balbir Singh <balbirs@nvidia.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Dan Williams <dan.j.williams@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Tested-by: Alison Schofield <alison.schofield@intel.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Asahi Lina <lina@asahilina.net> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chunyan Zhang <zhang.lyra@gmail.com> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Dave Jiang <dave.jiang@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ira Weiny <ira.weiny@intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: John Hubbard <jhubbard@nvidia.com> Cc: linmiaohe <linmiaohe@huawei.com> Cc: Logan Gunthorpe <logang@deltatee.com> Cc: Matthew Wilcow (Oracle) <willy@infradead.org> Cc: Michael "Camp Drill Sergeant" Ellerman <mpe@ellerman.id.au> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Peter Xu <peterx@redhat.com> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Ted Ts'o <tytso@mit.edu> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Vishal Verma <vishal.l.verma@intel.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: WANG Xuerui <kernel@xen0n.name> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/memory.c4
-rw-r--r--mm/memremap.c14
-rw-r--r--mm/migrate_device.c18
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mm_init.c2
6 files changed, 26 insertions, 16 deletions
diff --git a/mm/hmm.c b/mm/hmm.c
index 7e0229ae4a5a6..082f7b7c0b9eb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -248,7 +248,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* just report the PFN.
*/
if (is_device_private_entry(entry) &&
- pfn_swap_entry_to_page(entry)->pgmap->owner ==
+ page_pgmap(pfn_swap_entry_to_page(entry))->owner ==
range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
diff --git a/mm/memory.c b/mm/memory.c
index 4c12a05fabd9c..c9ddd991abda3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4338,6 +4338,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
+ struct dev_pagemap *pgmap;
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
/*
* migrate_to_ram is not yet ready to operate
@@ -4362,7 +4363,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ pgmap = page_pgmap(vmf->page);
+ ret = pgmap->ops->migrate_to_ram(vmf);
put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
diff --git a/mm/memremap.c b/mm/memremap.c
index 07bbe0eed084a..68099af9df4cd 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -458,8 +458,8 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_zone_device_folio(struct folio *folio)
{
- if (WARN_ON_ONCE(!folio->page.pgmap->ops ||
- !folio->page.pgmap->ops->page_free))
+ if (WARN_ON_ONCE(!folio->pgmap->ops ||
+ !folio->pgmap->ops->page_free))
return;
mem_cgroup_uncharge(folio);
@@ -486,12 +486,12 @@ void free_zone_device_folio(struct folio *folio)
* to clear folio->mapping.
*/
folio->mapping = NULL;
- folio->page.pgmap->ops->page_free(folio_page(folio, 0));
+ folio->pgmap->ops->page_free(folio_page(folio, 0));
- switch (folio->page.pgmap->type) {
+ switch (folio->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_COHERENT:
- put_dev_pagemap(folio->page.pgmap);
+ put_dev_pagemap(folio->pgmap);
break;
case MEMORY_DEVICE_FS_DAX:
@@ -514,7 +514,7 @@ void zone_device_page_init(struct page *page)
* Drivers shouldn't be allocating pages after calling
* memunmap_pages().
*/
- WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
set_page_count(page, 1);
lock_page(page);
}
@@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(zone_device_page_init);
#ifdef CONFIG_FS_DAX
bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
- if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX)
+ if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
return false;
/*
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5bd888223cc8b..7d0d64f67cdf0 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -106,6 +106,7 @@ again:
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ struct dev_pagemap *pgmap;
unsigned long mpfn = 0, pfn;
struct folio *folio;
struct page *page;
@@ -133,9 +134,10 @@ again:
goto next;
page = pfn_swap_entry_to_page(entry);
+ pgmap = page_pgmap(page);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
- page->pgmap->owner != migrate->pgmap_owner)
+ pgmap->owner != migrate->pgmap_owner)
goto next;
mpfn = migrate_pfn(page_to_pfn(page)) |
@@ -152,12 +154,16 @@ again:
}
page = vm_normal_page(migrate->vma, addr, pte);
if (page && !is_zone_device_page(page) &&
- !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
- else if (page && is_device_coherent_page(page) &&
- (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
- page->pgmap->owner != migrate->pgmap_owner))
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
goto next;
+ } else if (page && is_device_coherent_page(page)) {
+ pgmap = page_pgmap(page);
+
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ pgmap->owner != migrate->pgmap_owner)
+ goto next;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index cde076fa7d5e5..3cb72b579ffd3 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -368,6 +368,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
if (is_huge_zero_pmd(*pmd))
goto out;
folio = pmd_folio(*pmd);
+ if (folio_is_zone_device(folio))
+ goto out;
if (vma->vm_flags & VM_LOCKED)
mlock_folio(folio);
else
diff --git a/mm/mm_init.c b/mm/mm_init.c
index dbb92fdf36fe5..73e97ce95f58a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1007,7 +1007,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
- page->pgmap = pgmap;
+ page_folio(page)->pgmap = pgmap;
page->zone_device_data = NULL;
/*