-rw-r--r-- | mm/memory.c   | 91
-rw-r--r-- | mm/mprotect.c |  2
2 files changed, 72 insertions, 21 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 7751644f778b..d451b3125943 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -575,10 +575,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
  * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
  * page" backing, however the difference is that _all_ pages with a struct
  * page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The disadvantage is that pages are refcounted
- * (which can be slower and simply not an option for some PFNMAP users). The
- * advantage is that we don't have to follow the strict linearity rule of
- * PFNMAP mappings in order to support COWable mappings.
+ * normal pages by the VM. The only exception are zeropages, which are
+ * *never* refcounted.
+ *
+ * The disadvantage is that pages are refcounted (which can be slower and
+ * simply not an option for some PFNMAP users). The advantage is that we
+ * don't have to follow the strict linearity rule of PFNMAP mappings in
+ * order to support COWable mappings.
  *
  */
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -616,6 +619,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 	if (vma->vm_flags & VM_MIXEDMAP) {
 		if (!pfn_valid(pfn))
 			return NULL;
+		if (is_zero_pfn(pfn))
+			return NULL;
 		goto out;
 	} else {
 		unsigned long off;
@@ -641,6 +646,7 @@ check_pfn:
 	 * eg. VDSO mappings can cause them to exist.
 	 */
 out:
+	VM_WARN_ON_ONCE(is_zero_pfn(pfn));
 	return pfn_to_page(pfn);
 }
 
@@ -1977,12 +1983,48 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 	return pte_alloc_map_lock(mm, pmd, addr, ptl);
 }
 
-static int validate_page_before_insert(struct page *page)
+static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
+{
+	VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);
+	/*
+	 * Whoever wants to forbid the zeropage after some zeropages
+	 * might already have been mapped has to scan the page tables and
+	 * bail out on any zeropages. Zeropages in COW mappings can
+	 * be unshared using FAULT_FLAG_UNSHARE faults.
+	 */
+	if (mm_forbids_zeropage(vma->vm_mm))
+		return false;
+	/* zeropages in COW mappings are common and unproblematic. */
+	if (is_cow_mapping(vma->vm_flags))
+		return true;
+	/* Mappings that do not allow for writable PTEs are unproblematic. */
+	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
+		return true;
+	/*
+	 * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could
+	 * find the shared zeropage and longterm-pin it, which would
+	 * be problematic as soon as the zeropage gets replaced by a different
+	 * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would
+	 * now differ to what GUP looked up. FSDAX is incompatible to
+	 * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see
+	 * check_vma_flags).
+	 */
+	return vma->vm_ops && vma->vm_ops->pfn_mkwrite &&
+	       (vma_is_fsdax(vma) || vma->vm_flags & VM_IO);
+}
+
+static int validate_page_before_insert(struct vm_area_struct *vma,
+				       struct page *page)
 {
 	struct folio *folio = page_folio(page);
 
 	if (!folio_ref_count(folio))
 		return -EINVAL;
+	if (unlikely(is_zero_folio(folio))) {
+		if (!vm_mixed_zeropage_allowed(vma))
+			return -EINVAL;
+		return 0;
+	}
 	if (folio_test_anon(folio) || folio_test_slab(folio) ||
 	    page_has_type(page))
 		return -EINVAL;
@@ -1994,24 +2036,23 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
 			unsigned long addr, struct page *page, pgprot_t prot)
 {
 	struct folio *folio = page_folio(page);
+	pte_t pteval;
 
 	if (!pte_none(ptep_get(pte)))
 		return -EBUSY;
 	/* Ok, finally just insert the thing.. */
-	folio_get(folio);
-	inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
-	folio_add_file_rmap_pte(folio, page, vma);
-	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
+	pteval = mk_pte(page, prot);
+	if (unlikely(is_zero_folio(folio))) {
+		pteval = pte_mkspecial(pteval);
+	} else {
+		folio_get(folio);
+		inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
+		folio_add_file_rmap_pte(folio, page, vma);
+	}
+	set_pte_at(vma->vm_mm, addr, pte, pteval);
 	return 0;
 }
 
-/*
- * This is the old fallback for page remapping.
- *
- * For historical reasons, it only allows reserved pages. Only
- * old drivers should use this, and they needed to mark their
- * pages reserved for the old functions anyway.
- */
 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 			struct page *page, pgprot_t prot)
 {
@@ -2019,7 +2060,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	retval = validate_page_before_insert(page);
+	retval = validate_page_before_insert(vma, page);
 	if (retval)
 		goto out;
 	retval = -ENOMEM;
@@ -2037,7 +2078,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
 {
 	int err;
 
-	err = validate_page_before_insert(page);
+	err = validate_page_before_insert(vma, page);
 	if (err)
 		return err;
 	return insert_page_into_pte_locked(vma, pte, addr, page, prot);
@@ -2143,7 +2184,8 @@ EXPORT_SYMBOL(vm_insert_pages);
  * @page: source kernel page
  *
  * This allows drivers to insert individual pages they've allocated
- * into a user vma.
+ * into a user vma. The zeropage is supported in some VMAs,
+ * see vm_mixed_zeropage_allowed().
  *
  * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
@@ -2187,6 +2229,8 @@ EXPORT_SYMBOL(vm_insert_page);
  * @offset: user's requested vm_pgoff
  *
  * This allows drivers to map range of kernel pages into a user vma.
+ * The zeropage is supported in some VMAs, see
+ * vm_mixed_zeropage_allowed().
  *
  * Return: 0 on success and error code otherwise.
  */
@@ -2402,8 +2446,11 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vmf_insert_pfn);
 
-static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
+static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
 {
+	if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
+	    (mkwrite || !vm_mixed_zeropage_allowed(vma)))
+		return false;
 	/* these checks mirror the abort conditions in vm_normal_page */
 	if (vma->vm_flags & VM_MIXEDMAP)
 		return true;
@@ -2422,7 +2469,8 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
 	pgprot_t pgprot = vma->vm_page_prot;
 	int err;
 
-	BUG_ON(!vm_mixed_ok(vma, pfn));
+	if (!vm_mixed_ok(vma, pfn, mkwrite))
+		return VM_FAULT_SIGBUS;
 
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return VM_FAULT_SIGBUS;
@@ -3170,6 +3218,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
 	pte_t entry;
 
 	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
+	VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));
 
 	if (folio) {
 		VM_BUG_ON(folio_test_anon(folio) &&
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8c6cd8825273..888ef66468db 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -71,6 +71,8 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 		return page && PageAnon(page) && PageAnonExclusive(page);
 	}
 
+	VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
+
 	/*
 	 * Writable MAP_SHARED mapping: "clean" might indicate that the FS still
 	 * needs a real write-fault for writenotify
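
Usage sketch (not part of the patch above): the hypothetical ->fault handler below shows how a driver with a VM_MIXEDMAP mapping could hand out the shared zeropage for unbacked offsets via vmf_insert_mixed(). With this change the zeropage is inserted as a special, refcount-free PTE when vm_mixed_zeropage_allowed() permits it (e.g. read-only or COW mappings), and the call now fails with VM_FAULT_SIGBUS instead of hitting the old BUG_ON() when it does not. All drv_* names are made up for illustration, and the driver's ->mmap handler is assumed to have set VM_MIXEDMAP on the VMA.

/*
 * Hypothetical driver fault handler, assuming a VM_MIXEDMAP mapping.
 */
#include <linux/mm.h>
#include <linux/pfn_t.h>
#include <linux/pgtable.h>

/*
 * Stub for illustration: a real driver would look up its backing page for
 * vmf->pgoff here; returning NULL means "hole, map the shared zeropage".
 */
static struct page *drv_lookup_page(struct vm_fault *vmf)
{
	return NULL;
}

static vm_fault_t drv_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = drv_lookup_page(vmf);
	unsigned long pfn;

	if (!page)
		/*
		 * Hole: map the shared zeropage. Inserted as a special,
		 * refcount-free PTE if vm_mixed_zeropage_allowed() permits
		 * it; VM_FAULT_SIGBUS otherwise.
		 */
		pfn = my_zero_pfn(vmf->address);
	else
		/* Backed offset: insert the real, refcounted page as before. */
		pfn = page_to_pfn(page);

	return vmf_insert_mixed(vma, vmf->address, pfn_to_pfn_t(pfn));
}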