From 9462f770eda85b754e82b089d8a0d195fa160837 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:13 -0500 Subject: ext4: Update stale comment about write constraints The comment above do_journal_get_write_access() is very stale. Most of it just does not refer to what the function does today or how jbd2 works. The bit about transaction handling during write(2) is still correct so just update the function names in that part and move the comment to a more appropriate place. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-2-tytso@mit.edu --- fs/ext4/inode.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..d27ef74b9d6f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1004,30 +1004,6 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, return ret; } -/* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext4_get_block() - * and the commit_write(). So doing the jbd2_journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext4_writepage(). In that case, we - * *know* that ext4_writepage() has generated enough buffer credits to do the - * whole page. So we won't block on the journal in that case, which is good, - * because the caller may be PF_MEMALLOC. - * - * By accident, ext4 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. 
- * - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. - */ int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { @@ -1149,6 +1125,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } #endif +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the ext4_write_end(). So doing the jbd2_journal_start at the start of + * ext4_write_begin() is the right place. + */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) -- cgit v1.2.3 From c8e8e16dbbf0840e6f41575c3d6158bd331218bc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:14 -0500 Subject: ext4: Use nr_to_write directly in mpage_prepare_extent_to_map() When looking up extent of pages to map in mpage_prepare_extent_to_map() we count how many pages we still need to find in a copy of wbc->nr_to_write counter. With more complex page handling for data=journal mode, it will be easier to use wbc->nr_to_write directly so that we don't forget to carry over changes back to nr_to_write counter. 
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-3-tytso@mit.edu --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d27ef74b9d6f..ff913d0cd4b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2580,7 +2580,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; @@ -2613,7 +2612,9 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ - if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->wbc->nr_to_write <= + mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ @@ -2682,7 +2683,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; err = 0; } - left -= folio_nr_pages(folio); } folio_batch_release(&fbatch); cond_resched(); -- cgit v1.2.3 From 3f5d30636d2a188ee3cd22c6fef1ace5304a07bf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:15 -0500 Subject: ext4: Mark page for delayed dirtying only if it is pinned In data=journal mode, page should be dirtied only when it has buffers for checkpoint or it is writeably mapped. In the first case, we don't need to do anything special. In the second case, page was already added to the journal by ext4_page_mkwrite() and since transaction commit writeprotects mapped pages again, page should be writeable (and thus dirtied) only while it is part of the running transaction. So nothing needs to be done either. 
The only special case is when someone pins the page and uses this pin for modifying page data. So recognize this special case and only then mark the page as having data that needs adding to the journal. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-4-tytso@mit.edu --- fs/ext4/inode.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff913d0cd4b6..118eb674038a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3669,24 +3669,26 @@ const struct iomap_ops ext4_iomap_report_ops = { }; /* - * Whenever the folio is being dirtied, corresponding buffers should already - * be attached to the transaction (we take care of this in ext4_page_mkwrite() - * and ext4_write_begin()). However we cannot move buffers to dirty transaction - * lists here because ->dirty_folio is called under VFS locks and the folio - * is not necessarily locked. - * - * We cannot just dirty the folio and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is to mark the folio "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. + * For data=journal mode, folio should be marked dirty only when it was + * writeably mapped. When that happens, it was already attached to the + * transaction and marked as jbddirty (we take care of this in + * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings + * so we should have nothing to do here, except for the case when someone + * had the page pinned and dirtied the page through this pin (e.g. by doing + * direct IO to it). In that case we'd need to attach buffers here to the + * transaction but we cannot due to lock ordering. 
We cannot just dirty the + * folio and leave attached buffers clean, because the buffers' dirty state is + * "definitive". We cannot just set the buffers dirty or jbddirty because all + * the journalling code will explode. So what we do is to mark the folio + * "pending dirty" and next time ext4_writepages() is called, attach buffers + * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); - folio_set_checked(folio); + if (folio_maybe_dma_pinned(folio)) + folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } -- cgit v1.2.3 From f1496362e9d7b37fe6b8983086c1548a601b5594 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:16 -0500 Subject: ext4: Don't unlock page in ext4_bio_write_page() Do not unlock the written page in ext4_bio_write_page(). Instead leave the page locked and unlock it in the callers. We'll need to keep the page locked for data=journal writeback for a bit longer. 
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-5-tytso@mit.edu --- fs/ext4/inode.c | 2 ++ fs/ext4/page-io.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 118eb674038a..fcaa2a7a27a5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2076,6 +2076,7 @@ static int ext4_writepage(struct page *page, return -ENOMEM; } ret = ext4_bio_write_page(&io_submit, page, len); + unlock_page(page); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -2110,6 +2111,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) else len = PAGE_SIZE; err = ext4_bio_write_page(&mpd->io_submit, page, len); + unlock_page(page); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1e4db96a04e6..8703fd732abb 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -502,7 +502,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* Nothing to submit? Just unlock the page... */ if (!nr_to_submit) - goto unlock; + return 0; bh = head = page_buffers(page); @@ -550,7 +550,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } bh = bh->b_this_page; } while (bh != head); - goto unlock; + + return ret; } } @@ -565,7 +566,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, continue; io_submit_add_bh(io, inode, page, bounce_page, bh); } while ((bh = bh->b_this_page) != head); -unlock: - unlock_page(page); - return ret; + + return 0; } -- cgit v1.2.3 From eaf2ca10ca4ba450f8e514cb8bfc9149660b57f6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:17 -0500 Subject: ext4: Move page unlocking out of mpage_submit_page() Move page unlocking during page writeback out of mpage_submit_page() into the callers. 
This will allow writeback in data=journal mode to keep the page locked for a bit longer. Since page unlocking is tightly connected to increment of mpd->first_page (as that determines cleanup of locked but unwritten pages), move page unlocking as well as mpd->first_page handling into a helper function. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-6-tytso@mit.edu --- fs/ext4/inode.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fcaa2a7a27a5..85951da08d60 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2083,6 +2083,12 @@ static int ext4_writepage(struct page *page, return ret; } +static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) +{ + mpd->first_page++; + unlock_page(page); +} + static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; @@ -2111,10 +2117,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) else len = PAGE_SIZE; err = ext4_bio_write_page(&mpd->io_submit, page, len); - unlock_page(page); if (!err) mpd->wbc->nr_to_write--; - mpd->first_page++; return err; } @@ -2226,6 +2230,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_page(mpd, head->b_page); + mpage_page_done(mpd, head->b_page); if (err < 0) return err; } @@ -2357,6 +2362,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); + mpage_page_done(mpd, page); if (err < 0) goto out; } @@ -2666,14 +2672,11 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * modify metadata is simple. Just submit the page.
*/ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) { + if (ext4_page_nomap_can_writeout(&folio->page)) err = mpage_submit_page(mpd, &folio->page); - if (err < 0) - goto out; - } else { - folio_unlock(folio); - mpd->first_page += folio_nr_pages(folio); - } + mpage_page_done(mpd, &folio->page); + if (err < 0) + goto out; } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << -- cgit v1.2.3 From d8be7607de039e5a477b0a5c63959d81ac052c3b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:18 -0500 Subject: ext4: Move mpage_page_done() calls after error handling In case mpage_submit_page() returns error, it doesn't really matter whether we call mpage_page_done() and then return error or whether we return directly because in that case page cleanup will be done by mpage_release_unused_pages() instead. Logically, it makes more sense to leave the cleanup to mpage_release_unused_pages() because we didn't succeed in writing the page. So move mpage_page_done() calls after the error handling. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-7-tytso@mit.edu --- fs/ext4/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 85951da08d60..472f6b914f48 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2230,9 +2230,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_page(mpd, head->b_page); - mpage_page_done(mpd, head->b_page); if (err < 0) return err; + mpage_page_done(mpd, head->b_page); } if (lblk >= blocks) { mpd->scanned_until_end = 1; @@ -2362,9 +2362,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) goto out; /* Page fully mapped - let IO run! 
*/ err = mpage_submit_page(mpd, page); - mpage_page_done(mpd, page); if (err < 0) goto out; + mpage_page_done(mpd, page); } folio_batch_release(&fbatch); } @@ -2672,11 +2672,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * modify metadata is simple. Just submit the page. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) + if (ext4_page_nomap_can_writeout(&folio->page)) { err = mpage_submit_page(mpd, &folio->page); + if (err < 0) + goto out; + } mpage_page_done(mpd, &folio->page); - if (err < 0) - goto out; } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << -- cgit v1.2.3 From 3f079114bf522f27f3680238b6429f3dd45535b6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:19 -0500 Subject: ext4: Convert data=journal writeback to use ext4_writepages() Add support for writeback of journalled data directly into ext4_writepages() instead of offloading it to write_cache_pages(). This actually significantly simplifies the code and reduces code duplication. For checkpointing of committed data we can use ext4_writepages() right away the same way as writeback of ordered data uses it on transaction commit. For journalling of dirty mapped pages, we need to add a special case to mpage_prepare_extent_to_map() to add all page buffers to the journal.
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-8-tytso@mit.edu --- fs/ext4/inode.c | 341 ++++++++++++-------------------------------- include/trace/events/ext4.h | 7 - 2 files changed, 91 insertions(+), 257 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 472f6b914f48..652efb2221bf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -1632,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); -} - /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block @@ -1870,219 +1863,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int __ext4_journalled_writepage(struct page *page, - unsigned int len) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - handle_t *handle = NULL; - int ret = 0, err = 0; - int inline_data = ext4_has_inline_data(inode); - struct buffer_head *inode_bh = NULL; - loff_t size; - - ClearPageChecked(page); - - if (inline_data) { - BUG_ON(page->index != 0); - BUG_ON(len > ext4_get_max_inline_size(inode)); - inode_bh = ext4_journalled_write_inline_data(inode, len, page); - if (inode_bh == NULL) - goto out; - } - /* - * We need to release the page lock before we start the - * journal, so grab a reference so the page won't disappear - * out from under us. 
- */ - get_page(page); - unlock_page(page); - - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - put_page(page); - goto out_no_pagelock; - } - BUG_ON(!ext4_handle_valid(handle)); - - lock_page(page); - put_page(page); - size = i_size_read(inode); - if (page->mapping != mapping || page_offset(page) > size) { - /* The page got truncated from under us */ - ext4_journal_stop(handle); - ret = 0; - goto out; - } - - if (inline_data) { - ret = ext4_mark_inode_dirty(handle, inode); - } else { - struct buffer_head *page_bufs = page_buffers(page); - - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, do_journal_get_write_access); - - err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, write_end_fn); - } - if (ret == 0) - ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - ext4_set_inode_state(inode, EXT4_STATE_JDATA); -out: - unlock_page(page); -out_no_pagelock: - brelse(inode_bh); - return ret; -} - -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), no one guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * This function can get called via... 
- * - ext4_writepages after taking page lock (have journal handle) - * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via the kswapd/direct reclaim (no journal handle) - * - grab_page_cache when doing write_begin (have journal handle) - * - * We don't do any block allocation in this function. If we have page with - * multiple blocks we need to write those buffer_heads that are mapped. This - * is important for mmaped based write. So if we do with blocksize 1K - * truncate(f, 1024); - * a = mmap(f, 0, 4096); - * a[0] = 'a'; - * truncate(f, 4096); - * we have in the page first buffer_head mapped via page_mkwrite call back - * but other buffer_heads would be unmapped but dirty (dirty done via the - * do_wp_page). So writepage should write the first block. If we modify - * the mmap area beyond 1024 we will again get a page_fault and the - * page_mkwrite callback will do the block allocation and mark the - * buffer_heads mapped. - * - * We redirty the page if we have any buffer_heads that is either delay or - * unwritten in the page. - * - * We can get recursively called as show below. - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * But since we don't do any block allocation we should not deadlock. - * Page also have the dirty flag cleared so we don't get recurive page_lock. 
- */ -static int ext4_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int ret = 0; - loff_t size; - unsigned int len; - struct buffer_head *page_bufs = NULL; - struct inode *inode = page->mapping->host; - struct ext4_io_submit io_submit; - - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { - folio_invalidate(folio, 0, folio_size(folio)); - folio_unlock(folio); - return -EIO; - } - - trace_ext4_writepage(page); - size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT && - !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - /* Should never happen but for bugs in other kernel subsystems */ - if (!page_has_buffers(page)) { - ext4_warning_inode(inode, - "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); - return 0; - } - - page_bufs = page_buffers(page); - /* - * We cannot do block allocation or other extent handling in this - * function. If there are buffers needing that, we have to redirty - * the page. But we may reach here when we do a journal commit via - * journal_submit_inode_data_buffers() and in that case we must write - * allocated buffers to achieve data=ordered mode guarantees. - * - * Also, if there is only one buffer per page (the fs block - * size == the page size), if one buffer needs block - * allocation or needs to modify the extent tree to clear the - * unwritten flag, we know that the page can't be written at - * all, so we might as well refuse the write immediately. - * Unfortunately if the block size != page size, we can't as - * easily detect this case using ext4_walk_page_buffers(), but - * for the extremely common case, this is an optimization that - * skips a useless round trip through ext4_bio_write_page(). 
- */ - if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - if ((current->flags & PF_MEMALLOC) || - (inode->i_sb->s_blocksize == PAGE_SIZE)) { - /* - * For memory cleaning there's no point in writing only - * some buffers. So just bail out. Warn if we came here - * from direct reclaim. - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) - == PF_MEMALLOC); - unlock_page(page); - return 0; - } - } - - if (PageChecked(page) && ext4_should_journal_data(inode)) - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - return __ext4_journalled_writepage(page, len); - - ext4_io_submit_init(&io_submit, wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - ret = ext4_bio_write_page(&io_submit, page, len); - unlock_page(page); - ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); - return ret; -} - static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) { mpd->first_page++; @@ -2563,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page) return false; } +static int ext4_journal_page_buffers(handle_t *handle, struct page *page, + int len) +{ + struct buffer_head *page_bufs = page_buffers(page); + struct inode *inode = page->mapping->host; + int ret, err; + + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); + if (ret == 0) + ret = err; + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + 
+ return ret; +} + +static int mpage_journal_page_buffers(handle_t *handle, + struct mpage_da_data *mpd, + struct page *page) +{ + struct inode *inode = mpd->inode; + loff_t size = i_size_read(inode); + int len; + + ClearPageChecked(page); + clear_page_dirty_for_io(page); + mpd->wbc->nr_to_write--; + + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + + return ext4_journal_page_buffers(handle, page, len); +} + /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages @@ -2595,11 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; + handle_t *handle = NULL; + int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; + + if (ext4_should_journal_data(mpd->inode)) { + handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, + bpp); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } folio_batch_init(&fbatch); mpd->map.m_len = 0; mpd->next_page = index; @@ -2629,6 +2462,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; + if (handle) { + err = ext4_journal_ensure_credits(handle, bpp, + 0); + if (err < 0) + goto out; + } + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no @@ -2668,8 +2508,15 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->first_page = folio->index; mpd->next_page = folio->index + folio_nr_pages(folio); /* - * Writeout for transaction commit where we cannot - * modify metadata is simple. Just submit the page. + * Writeout when we cannot modify metadata is simple. + * Just submit the page. 
For data=journal mode we + * first handle writeout of the page for checkpoint and + * only after that handle delayed page dirtying. This + * is crutial so that forcing a transaction commit and + * then calling filemap_write_and_wait() guarantees + * current state of data is in its final location. Such + * sequence is used for example by insert/collapse + * range operations before discarding the page cache. */ if (!mpd->can_map) { if (ext4_page_nomap_can_writeout(&folio->page)) { @@ -2677,6 +2524,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (err < 0) goto out; } + /* Pending dirtying of journalled data? */ + if (PageChecked(&folio->page)) { + err = mpage_journal_page_buffers(handle, + mpd, &folio->page); + if (err < 0) + goto out; + } mpage_page_done(mpd, &folio->page); } else { /* Add all dirty buffers to mpd */ @@ -2694,18 +2548,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) cond_resched(); } mpd->scanned_until_end = 1; + if (handle) + ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); + if (handle) + ext4_journal_stop(handle); return err; } -static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, - void *data) -{ - return ext4_writepage(&folio->page, wbc); -} - static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; @@ -2731,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; - if (ext4_should_journal_data(inode)) { - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); - blk_finish_plug(&plug); - goto out_writepages; - } - /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that @@ -2772,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) ext4_journal_stop(handle); } + /* + * data=journal 
mode does not do delalloc so we just need to writeout / + * journal already mapped buffers + */ + if (ext4_should_journal_data(inode)) + mpd->can_map = 0; + if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in @@ -3148,9 +3000,8 @@ static int ext4_da_write_end(struct file *file, * i_disksize since writeback will push i_disksize upto i_size * eventually. If the end of the current write is > i_size and * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as neither - * ext4_writepage() nor certain ext4_writepages() paths not - * allocating blocks update i_disksize. + * check), we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks update i_disksize. * * Note that we defer inode dirtying to generic_write_end() / * ext4_da_write_inline_data_end(). @@ -5376,7 +5227,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then - * confuse e.g. concurrent ext4_writepage() seeing dirty folio without + * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. 
@@ -6314,18 +6165,8 @@ retry_alloc: err = __block_write_begin(page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - do_journal_get_write_access)) - goto out_error; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - write_end_fn)) - goto out_error; - if (ext4_jbd2_inode_add_write(handle, inode, - page_offset(page), len)) + if (ext4_journal_page_buffers(handle, page, len)) goto out_error; - ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { unlock_page(page); } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 77b426ae0064..ebccf6a6aa1b 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op, (unsigned long) __entry->index) ); -DEFINE_EVENT(ext4__page_op, ext4_writepage, - - TP_PROTO(struct page *page), - - TP_ARGS(page) -); - DEFINE_EVENT(ext4__page_op, ext4_readpage, TP_PROTO(struct page *page), -- cgit v1.2.3 From e6c28a26b799c7640b77daff3e4a67808c74381c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Mar 2023 15:25:28 +0100 Subject: ext4: Fix warnings when freezing filesystem with journaled data Test generic/390 in data=journal mode often triggers a warning that ext4_do_writepages() tries to start a transaction on frozen filesystem. This happens because although all dirty data is properly written, jbd2 checkpointing code writes data through submit_bh() and as a result only buffer dirty bits are cleared but page dirty bits stay set. Later when the filesystem is frozen, writeback code comes, tries to write supposedly dirty pages and the warning triggers. Fix the problem by calling sync_filesystem() once more after flushing the whole journal to clear stray page dirty bits. 
[ Applied fixup patches to address crashes when running data=journal tests; see links for more details -- TYT ] Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230308142528.12384-1-jack@suse.cz Reported-by: Eric Biggers Link: https://lore.kernel.org/all/20230319183617.GA896@sol.localdomain Link: https://lore.kernel.org/r/20230323145404.21381-1-jack@suse.cz Link: https://lore.kernel.org/r/20230323145404.21381-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 19 ++++++++++++++++--- fs/ext4/super.c | 11 +++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 652efb2221bf..6445b8017a8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2410,6 +2410,7 @@ static int mpage_journal_page_buffers(handle_t *handle, static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; + struct super_block *sb = mpd->inode->i_sb; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; @@ -2427,15 +2428,23 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) else tag = PAGECACHE_TAG_DIRTY; - if (ext4_should_journal_data(mpd->inode)) { + mpd->map.m_len = 0; + mpd->next_page = index; + /* + * Start a transaction for writeback of journalled data. We don't start + * start the transaction if the filesystem is frozen. In that case we + * should not have any dirty data to write anymore but possibly there + * are stray page dirty bits left by the checkpointing code so this + * loop clears them. 
+ */ + if (ext4_should_journal_data(mpd->inode) && + sb->s_writers.frozen < SB_FREEZE_FS) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); - mpd->map.m_len = 0; - mpd->next_page = index; while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); @@ -2520,12 +2529,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) */ if (!mpd->can_map) { if (ext4_page_nomap_can_writeout(&folio->page)) { + WARN_ON_ONCE(sb->s_writers.frozen == + SB_FREEZE_COMPLETE); err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; } /* Pending dirtying of journalled data? */ if (PageChecked(&folio->page)) { + WARN_ON_ONCE(sb->s_writers.frozen >= + SB_FREEZE_FS); err = mpage_journal_page_buffers(handle, mpd, &folio->page); if (err < 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f43e526112ae..f226f8ab469b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6293,6 +6293,17 @@ static int ext4_freeze(struct super_block *sb) if (error < 0) goto out; + /* + * Do another sync. We really should not have any dirty data + * anymore but our checkpointing code does not clear page dirty + * bits due to locking constraints so writeback still can get + * started for inodes with journalled data which triggers + * annoying warnings. + */ + error = sync_filesystem(sb); + if (error < 0) + goto out; + /* Journal blocked and flushed, clear needs_recovery flag. 
*/ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) -- cgit v1.2.3 From 98ccceee3e0637a37e20c1c12a08173663db77e7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 15 Mar 2023 14:34:18 -0400 Subject: ext4: fix comment: "start start" -> "start" in mpage_prepare_extent_to_map() Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6445b8017a8e..dbcc8b48c7ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2432,7 +2432,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->next_page = index; /* * Start a transaction for writeback of journalled data. We don't start - * start the transaction if the filesystem is frozen. In that case we + * the transaction if the filesystem is frozen. In that case we * should not have any dirty data to write anymore but possibly there * are stray page dirty bits left by the checkpointing code so this * loop clears them. -- cgit v1.2.3 From 66dabbb65d673aef40dd17bf62c042be8f6d4a4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:34:10 +0100 Subject: mm: return an ERR_PTR from __filemap_get_folio Instead of returning NULL for all errors, distinguish between: - no entry found and not asked to allocate (-ENOENT) - failed to allocate memory (-ENOMEM) - would block (-EAGAIN) so that callers don't have to guess the error based on the passed in flags. Also pass the error through the direct callers: filemap_get_folio, filemap_lock_folio, filemap_grab_folio and filemap_get_incore_folio.
[hch@lst.de: fix null-pointer deref] Link: https://lkml.kernel.org/r/20230310070023.GA13563@lst.de Link: https://lkml.kernel.org/r/20230310043137.GA1624890@u2004 Link: https://lkml.kernel.org/r/20230307143410.28031-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi [nilfs2] Cc: Andreas Gruenbacher Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- fs/afs/dir.c | 10 +++++----- fs/afs/dir_edit.c | 2 +- fs/afs/write.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/move_extent.c | 8 ++++---- fs/hugetlbfs/inode.c | 2 +- fs/iomap/buffered-io.c | 11 ++--------- fs/netfs/buffered_read.c | 4 ++-- fs/nfs/file.c | 4 ++-- fs/nilfs2/page.c | 6 +++--- include/linux/pagemap.h | 11 ++++++----- mm/filemap.c | 14 ++++++++------ mm/folio-compat.c | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 6 ++++-- mm/memcontrol.c | 2 +- mm/mincore.c | 2 +- mm/shmem.c | 4 ++-- mm/swap_state.c | 17 ++++++++++------- mm/swapfile.c | 4 ++-- mm/truncate.c | 15 ++++++++------- 21 files changed, 67 insertions(+), 65 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 82690d1dd49a..f92b9e62d567 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -319,16 +319,16 @@ expand: struct folio *folio; folio = filemap_get_folio(mapping, i); - if (!folio) { + if (IS_ERR(folio)) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_inval); - - ret = -ENOMEM; folio = __filemap_get_folio(mapping, i, FGP_LOCK | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto error; + } folio_attach_private(folio, (void *)1); folio_unlock(folio); } @@ -524,7 +524,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, */ folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, FGP_ACCESSED, 0); - if (!folio) { + if (IS_ERR(folio)) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } diff --git a/fs/afs/dir_edit.c 
b/fs/afs/dir_edit.c index 0ab7752d1b75..f0eddccbdd95 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); else if (folio && !folio_test_private(folio)) folio_attach_private(folio, (void *)1); diff --git a/fs/afs/write.c b/fs/afs/write.c index 571f3b9a417e..c822d6006033 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping, _debug("kill %lx (to %lx)", index, last); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } @@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, _debug("redirty %llx @%llx", len, start); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..d7973743417b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5395,7 +5395,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!folio) + if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2de9829aed63..7bf6d069199c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -141,18 +141,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, flags = memalloc_nofs_save(); folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, mapping_gfp_mask(mapping[0])); - if (!folio[0]) { + if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); - return -ENOMEM; + return 
PTR_ERR(folio[0]); } folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!folio[1]) { + if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); - return -ENOMEM; + return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9062da6da567..702d79639c0d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, struct folio *folio; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6f4c97a6d7e9..96bb56c203f4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) { unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; - struct folio *folio; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (folio) - return folio; - - if (iter->flags & IOMAP_NOWAIT) - return ERR_PTR(-EAGAIN); - return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(iomap_get_folio); @@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, start_byte >> PAGE_SHIFT); - if (!folio) { + if (IS_ERR(folio)) { start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + PAGE_SIZE; continue; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7679a68e8193..209726a9cfdb 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -350,8 +350,8 @@ int 
netfs_write_begin(struct netfs_inode *ctx, retry: folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 893625eacab9..1d03406e6c03 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -336,8 +336,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, start: folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); *pagep = &folio->page; ret = nfs_flush_incompatible(file, folio); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 41ccd43cd979..5cf30827f244 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -259,10 +259,10 @@ repeat: NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); - if (unlikely(!dfolio)) { + if (unlikely(IS_ERR(dfolio))) { /* No empty page is added to the page cache */ - err = -ENOMEM; folio_unlock(folio); + err = PTR_ERR(dfolio); break; } if (unlikely(!folio_buffers(folio))) @@ -311,7 +311,7 @@ repeat: folio_lock(folio); dfolio = filemap_lock_folio(dmap, index); - if (dfolio) { + if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); nilfs_copy_page(&dfolio->page, &folio->page, 0); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 306a0f63cea8..fdcd595d2294 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -520,7 +520,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned with an increased refcount. * - * Otherwise, %NULL is returned. 
+ * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for + * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_get_folio(struct address_space *mapping, pgoff_t index) @@ -537,8 +538,8 @@ static inline struct folio *filemap_get_folio(struct address_space *mapping, * present, it is returned locked with an increased refcount. * * Context: May sleep. - * Return: A folio or %NULL if there is no folio in the cache for this - * index. Will not return a shadow, swap or DAX entry. + * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for + * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_lock_folio(struct address_space *mapping, pgoff_t index) @@ -555,8 +556,8 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping, * a new folio is created. The folio is locked, marked as accessed, and * returned. * - * Return: A found or created folio. NULL if no folio is found and failed to - * create a folio. + * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found + * and failed to create a folio. */ static inline struct folio *filemap_grab_folio(struct address_space *mapping, pgoff_t index) diff --git a/mm/filemap.c b/mm/filemap.c index ac161b50f5bc..a34abfe8c654 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1907,7 +1907,7 @@ out: * * If there is a page cache page, it is returned with an increased refcount. * - * Return: The found folio or %NULL otherwise. + * Return: The found folio or an ERR_PTR() otherwise. 
*/ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp) @@ -1925,7 +1925,7 @@ repeat: if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); - return NULL; + return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); @@ -1964,7 +1964,7 @@ no_page: folio = filemap_alloc_folio(gfp, 0); if (!folio) - return NULL; + return ERR_PTR(-ENOMEM); if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; @@ -1989,6 +1989,8 @@ no_page: folio_unlock(folio); } + if (!folio) + return ERR_PTR(-ENOENT); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -3258,7 +3260,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); - if (likely(folio)) { + if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. @@ -3287,7 +3289,7 @@ retry_find: folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!folio) { + if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); @@ -3638,7 +3640,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, 0); if (!folio) return ERR_PTR(-ENOMEM); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 1754daa85d35..2511c055a35f 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -97,7 +97,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); - if (!folio) + if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 70008dd7f215..2d860e70fe88 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c 
@@ -3092,7 +3092,7 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, struct folio *folio = filemap_get_folio(mapping, index); nr_pages = 1; - if (!folio) + if (IS_ERR(folio)) continue; if (!folio_test_large(folio)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 07abcb6eb203..712e32b38295 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5780,7 +5780,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, */ new_folio = false; folio = filemap_lock_folio(mapping, idx); - if (!folio) { + if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -6071,6 +6071,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma_end_reservation(h, vma, haddr); pagecache_folio = filemap_lock_folio(mapping, idx); + if (IS_ERR(pagecache_folio)) + pagecache_folio = NULL; } ptl = huge_pte_lock(h, mm, ptep); @@ -6182,7 +6184,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (is_continue) { ret = -EFAULT; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) goto out; folio_in_pagecache = true; } else if (!*pagep) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13ec89c45389..0524add35cae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5705,7 +5705,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* shmem/tmpfs may report page out on swap: account for that too. */ index = linear_page_index(vma, addr); folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); - if (!folio) + if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } diff --git a/mm/mincore.c b/mm/mincore.c index d359650b0f75..2d5be013a25a 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -61,7 +61,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 
*/ folio = filemap_get_incore_folio(mapping, index); - if (folio) { + if (!IS_ERR(folio)) { present = folio_test_uptodate(folio); folio_put(folio); } diff --git a/mm/shmem.c b/mm/shmem.c index 93cb39852a16..fa6e38f2f55f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -605,7 +605,7 @@ next: index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; folio = filemap_get_folio(inode->i_mapping, index); - if (!folio) + if (IS_ERR(folio)) goto drop; /* No huge page at the end of the file: nothing to split */ @@ -3214,7 +3214,7 @@ static const char *shmem_get_link(struct dentry *dentry, if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); - if (!folio) + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 92234f4b51d2..b76a65ac28b3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -336,7 +336,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, struct folio *folio; folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - if (folio) { + if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -366,6 +366,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } + } else { + folio = NULL; } return folio; @@ -388,23 +390,24 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct swap_info_struct *si; struct folio *folio = filemap_get_entry(mapping, index); + if (!folio) + return ERR_PTR(-ENOENT); if (!xa_is_value(folio)) - goto out; + return folio; if (!shmem_mapping(mapping)) - return NULL; + return ERR_PTR(-ENOENT); swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. 
*/ if (non_swap_entry(swp)) - return NULL; + return ERR_PTR(-ENOENT); /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) - return NULL; + return ERR_PTR(-ENOENT); index = swp_offset(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); -out: return folio; } @@ -431,7 +434,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - if (folio) + if (!IS_ERR(folio)) return folio_file_page(folio, swp_offset(entry)); /* diff --git a/mm/swapfile.c b/mm/swapfile.c index c1b97436f811..00b3e46becad 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -136,7 +136,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, int ret = 0; folio = filemap_get_folio(swap_address_space(entry), offset); - if (!folio) + if (IS_ERR(folio)) return 0; /* * When this function is called from scan_swap_map_slots() and it's @@ -2095,7 +2095,7 @@ retry: entry = swp_entry(type, i); folio = filemap_get_folio(swap_address_space(entry), i); - if (!folio) + if (IS_ERR(folio)) continue; /* diff --git a/mm/truncate.c b/mm/truncate.c index 7b4ea4c4a46b..86de31ed4d32 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -375,7 +375,7 @@ void truncate_inode_pages_range(struct address_space *mapping, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); - if (folio) { + if (!IS_ERR(folio)) { same_folio = lend < folio_pos(folio) + folio_size(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio->index + folio_nr_pages(folio); @@ -387,14 +387,15 @@ void truncate_inode_pages_range(struct address_space *mapping, folio = NULL; } - if (!same_folio) + if (!same_folio) { folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, FGP_LOCK, 0); - if (folio) { - if (!truncate_inode_partial_folio(folio, lstart, lend)) - end = folio->index; - 
folio_unlock(folio); - folio_put(folio); + if (!IS_ERR(folio)) { + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); + } } index = start; -- cgit v1.2.3 From 4da2f6e3c45999e904de1edcd06c8533715cc1b5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:05 +0000 Subject: ext4: Turn mpage_process_page() into mpage_process_folio() The page/folio is only used to extract the buffers, so this is a simple change. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-6-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dbcc8b48c7ba..398b0e505300 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2022,21 +2022,22 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* - * mpage_process_page - update page buffers corresponding to changed extent and - * may submit fully mapped page for IO - * - * @mpd - description of extent to map, on return next extent to map - * @m_lblk - logical block mapping. - * @m_pblk - corresponding physical mapping. - * @map_bh - determines on return whether this page requires any further + * mpage_process_folio - update folio buffers corresponding to changed extent + * and may submit fully mapped page for IO + * @mpd: description of extent to map, on return next extent to map + * @folio: Contains these buffers. + * @m_lblk: logical block mapping. + * @m_pblk: corresponding physical mapping. + * @map_bh: determines on return whether this page requires any further * mapping or not. - * Scan given page buffers corresponding to changed extent and update buffer + * + * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. 
* We map delalloc buffers to their physical location, clear unwritten bits. - * If the given page is not fully mapped, we update @map to the next extent in - * the given page that needs mapping & return @map_bh as true. + * If the given folio is not fully mapped, we update @mpd to the next extent in + * the given folio that needs mapping & return @map_bh as true. */ -static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, +static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { @@ -2049,14 +2050,14 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. - * Find next buffer in the page to map. + * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; @@ -2129,9 +2130,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (nr == 0) break; for (i = 0; i < nr; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - err = mpage_process_page(mpd, page, &lblk, &pblock, + err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh @@ -2141,10 +2142,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! 
*/ - err = mpage_submit_page(mpd, page); + err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; - mpage_page_done(mpd, page); + mpage_page_done(mpd, &folio->page); } folio_batch_release(&fbatch); } -- cgit v1.2.3 From 81a0d3e126a0bb4300d1db259d89b839124f2cff Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:06 +0000 Subject: ext4: Convert mpage_submit_page() to mpage_submit_folio() All callers now have a folio so we can pass one in and use the folio APIs to support large folios as well as save instructions by eliminating calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-7-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 398b0e505300..dcb852e7e1cc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1869,34 +1869,33 @@ static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) unlock_page(page); } -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { - int len; + size_t len; loff_t size; int err; - BUG_ON(page->index != mpd->first_page); - clear_page_dirty_for_io(page); + BUG_ON(folio->index != mpd->first_page); + folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing - * data through mmap while writeback runs. clear_page_dirty_for_io() + * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get - * written to again until we release page lock. 
So only after - * clear_page_dirty_for_io() we are safe to sample i_size for + * written to again until we release folio lock. So only after + * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_page() to zero-out tail of the written page. We rely * on the barrier provided by TestClearPageDirty in - * clear_page_dirty_for_io() to make sure i_size is really sampled only + * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT && + len = folio_size(folio); + if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len); + err = ext4_bio_write_page(&mpd->io_submit, &folio->page, len); if (!err) mpd->wbc->nr_to_write--; @@ -2009,7 +2008,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { - err = mpage_submit_page(mpd, head->b_page); + err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_page_done(mpd, head->b_page); @@ -2142,7 +2141,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ - err = mpage_submit_page(mpd, &folio->page); + err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_page_done(mpd, &folio->page); @@ -2532,12 +2531,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (ext4_page_nomap_can_writeout(&folio->page)) { WARN_ON_ONCE(sb->s_writers.frozen == SB_FREEZE_COMPLETE); - err = mpage_submit_page(mpd, &folio->page); + err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; } /* Pending dirtying of journalled data? 
*/ - if (PageChecked(&folio->page)) { + if (folio_test_checked(folio)) { WARN_ON_ONCE(sb->s_writers.frozen >= SB_FREEZE_FS); err = mpage_journal_page_buffers(handle, -- cgit v1.2.3 From 33483b3b6ee4328f37c3dcf702ba979e6a00bf8f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:07 +0000 Subject: ext4: Convert mpage_page_done() to mpage_folio_done() All callers now have a folio so we can pass one in and use the folio APIs to support large folios as well as save instructions by eliminating a call to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-8-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dcb852e7e1cc..916c923a25b8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1863,10 +1863,10 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) +static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - mpd->first_page++; - unlock_page(page); + mpd->first_page += folio_nr_pages(folio); + folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) @@ -2011,7 +2011,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; - mpage_page_done(mpd, head->b_page); + mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; @@ -2144,7 +2144,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; - mpage_page_done(mpd, &folio->page); + mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } @@ -2544,7 +2544,7 @@ static int mpage_prepare_extent_to_map(struct 
mpage_da_data *mpd) if (err < 0) goto out; } - mpage_page_done(mpd, &folio->page); + mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << -- cgit v1.2.3 From e8d6062c50acbf1aba88ca6adaa1bcda058abeab Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:08 +0000 Subject: ext4: Convert ext4_bio_write_page() to ext4_bio_write_folio() The only caller now has a folio so pass it in directly and avoid the call to page_folio() at the beginning. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-9-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ++--- fs/ext4/inode.c | 6 +++--- fs/ext4/page-io.c | 10 ++++------ 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 993d3284b430..3535338caf0d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3756,9 +3756,8 @@ extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); -extern int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len); +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, + size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 916c923a25b8..2be604af7aec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1885,8 +1885,8 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. 
So only after * folio_clear_dirty_for_io() we are safe to sample i_size for - * ext4_bio_write_page() to zero-out tail of the written page. We rely - * on the barrier provided by TestClearPageDirty in + * ext4_bio_write_folio() to zero-out tail of the written page. We rely + * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ @@ -1895,7 +1895,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; - err = ext4_bio_write_page(&mpd->io_submit, &folio->page, len); + err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index f0144ef39bb1..8fe1875b0a42 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -426,11 +426,9 @@ submit_and_retry: io->io_next_block++; } -int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len) +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, + size_t len) { - struct folio *folio = page_folio(page); struct folio *io_folio = folio; struct inode *inode = folio->mapping->host; unsigned block_start; @@ -523,8 +521,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, - 0, gfp_flags); + bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && -- cgit v1.2.3 From 3edde93e07954a8860d67be4a2165514a083b6e8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:09 +0000 Subject: ext4: Convert ext4_readpage_inline() to take a folio Use the folio API in this function, saves a few calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-10-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/inline.c | 14 +++++++------- fs/ext4/inode.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3535338caf0d..6b21b3aa92d6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3549,7 +3549,7 @@ extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); -extern int ext4_readpage_inline(struct inode *inode, struct page *page); +int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1602d74b5eeb..e9bae3002319 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -501,7 +501,7 @@ out: return ret; } -int ext4_readpage_inline(struct inode *inode, struct page *page) +int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; @@ -515,16 +515,16 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ - if (!page->index) - ret = ext4_read_inline_page(inode, page); - else if (!PageUptodate(page)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!folio->index) + ret = ext4_read_inline_page(inode, &folio->page); + else if (!folio_test_uptodate(folio)) { + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); + folio_unlock(folio); return ret >= 0 ? 
0 : ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2be604af7aec..81bcc73f610f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3155,7 +3155,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio) trace_ext4_readpage(page); if (ext4_has_inline_data(inode)) - ret = ext4_readpage_inline(inode, page); + ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, page); -- cgit v1.2.3 From 4d934a5e6caa6dcdd3fbee7b96fe512a455863b6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:16 +0000 Subject: ext4: Convert ext4_write_begin() to use a folio Remove a lot of calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-17-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 81bcc73f610f..d499417ffce7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1139,7 +1139,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, int ret, needed_blocks; handle_t *handle; int retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; unsigned from, to; @@ -1166,68 +1166,69 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page + * __filemap_get_folio() can take a long time if the + * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. + * the folio (if needed) without using GFP_NOFS. 
*/ retry_grab: - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!folio) return -ENOMEM; /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ - if (!page_has_buffers(page)) - create_empty_buffers(page, inode->i_sb->s_blocksize, 0); + if (!folio_buffers(folio)) + create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); - unlock_page(page); + folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { - put_page(page); + folio_put(folio); return PTR_ERR(handle); } - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(&folio->page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else - ret = __block_write_begin(page, pos, len, ext4_get_block); + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, - page_buffers(page), from, to, NULL, - do_journal_get_write_access); + 
folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); - unlock_page(page); + folio_unlock(folio); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -1255,10 +1256,10 @@ retry_journal: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; - put_page(page); + folio_put(folio); return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } -- cgit v1.2.3 From 64fb31367598188a0a230b81c6f4397fa71fd033 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:17 +0000 Subject: ext4: Convert ext4_write_end() to use a folio Convert the incoming struct page to a folio. Replaces two implicit calls to compound_head() with one explicit call. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-18-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d499417ffce7..141e050de960 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1289,6 +1289,7 @@ static int ext4_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1304,7 +1305,7 @@ static int ext4_write_end(struct file *file, copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* - * it's important to update i_size while still holding page lock: + * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. 
* * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree @@ -1312,15 +1313,15 @@ static int ext4_write_end(struct file *file, */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) -- cgit v1.2.3 From feb22b77b855a6529675b4e998970ab461c0f446 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:18 +0000 Subject: ext4: Use a folio in ext4_journalled_write_end() Convert the incoming page to a folio to remove a few calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-19-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 141e050de960..9cf3daa11534 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1392,6 +1392,7 @@ static int ext4_journalled_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1410,25 +1411,26 @@ static int ext4_journalled_write_end(struct file *file, if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); - if (unlikely(copied < len) && !PageUptodate(page)) { + if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, page, from + copied, to); - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); -- cgit v1.2.3 From 86324a21627a40f949bf787b55c45b9856523f9d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:19 +0000 Subject: ext4: Convert 
ext4_journalled_zero_new_buffers() to use a folio Remove a call to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-20-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9cf3daa11534..8d3b0742428d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1359,24 +1359,24 @@ static int ext4_write_end(struct file *file, */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, - struct page *page, + struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; - zero_user(page, start, size); + folio_zero_range(folio, start, size); write_end_fn(handle, inode, bh); } clear_buffer_new(bh); @@ -1413,10 +1413,11 @@ static int ext4_journalled_write_end(struct file *file, if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; - ext4_journalled_zero_new_buffers(handle, inode, page, from, to); + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); } else { if (unlikely(copied < len)) - ext4_journalled_zero_new_buffers(handle, inode, page, + ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), -- cgit v1.2.3 From 9d3973de9a3745ea9d38bdfb953a4c4bee81ac2a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:20 +0000 Subject: ext4: Convert __ext4_block_zero_page_range() to use a folio Use folio APIs 
throughout. Saves many calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230324180129.1220691-21-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d3b0742428d..bf646fc76e9d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3669,23 +3669,26 @@ static int __ext4_block_zero_page_range(handle_t *handle, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; - struct page *page; + struct folio *folio; int err = 0; - page = find_or_create_page(mapping, from >> PAGE_SHIFT, - mapping_gfp_constraint(mapping, ~__GFP_FS)); - if (!page) + folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (!folio) return -ENOMEM; blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + bh = folio_buffers(folio); + if (!bh) { + create_empty_buffers(&folio->page, blocksize, 0); + bh = folio_buffers(folio); + } /* Find the buffer that contains "offset" */ - bh = page_buffers(page); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3707,7 +3710,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, } /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { @@ -3717,7 +3720,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. 
*/ BUG_ON(!fscrypt_has_encryption_key(inode)); - err = fscrypt_decrypt_pagecache_blocks(page_folio(page), + err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { @@ -3733,7 +3736,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (err) goto unlock; } - zero_user(page, offset, length); + folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { @@ -3747,8 +3750,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } -- cgit v1.2.3 From 02e4b04c56d03a518b958783900b22f33c6643d6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:21 +0000 Subject: ext4: Convert ext4_page_nomap_can_writeout to ext4_folio_nomap_can_writeout Its one caller already uses a folio. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-22-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf646fc76e9d..bf1a19c09b5d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2335,12 +2335,12 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -/* Return true if the page needs to be written as part of transaction commit */ -static bool ext4_page_nomap_can_writeout(struct page *page) +/* Return true if the folio needs to be written as part of transaction commit */ +static bool ext4_folio_nomap_can_writeout(struct folio *folio) { struct buffer_head *bh, *head; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) return true; @@ -2533,7 +2533,7 @@ static int mpage_prepare_extent_to_map(struct 
mpage_da_data *mpd) * range operations before discarding the page cache. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) { + if (ext4_folio_nomap_can_writeout(folio)) { WARN_ON_ONCE(sb->s_writers.frozen == SB_FREEZE_COMPLETE); err = mpage_submit_folio(mpd, folio); -- cgit v1.2.3 From 0b5a254395dc6db5c38d89e606c0298ed4c9e984 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:22 +0000 Subject: ext4: Use a folio in ext4_da_write_begin() Remove a few calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-23-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf1a19c09b5d..9e8afac5c82e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2902,7 +2902,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret, retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; @@ -2929,22 +2929,23 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!folio) return -ENOMEM; - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #else - ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) 
{ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -2959,7 +2960,7 @@ retry: return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } -- cgit v1.2.3 From c0be8e6f081b3e966e21f52679b2f809b7df10b8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:23 +0000 Subject: ext4: Convert ext4_mpage_readpages() to work on folios This definitely doesn't include support for large folios; there are all kinds of assumptions about the number of buffers attached to a folio. But it does remove several calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-24-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 7 +++---- fs/ext4/readpage.c | 58 ++++++++++++++++++++++++++---------------------------- 3 files changed, 32 insertions(+), 35 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6b21b3aa92d6..83f0cc02250f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3646,7 +3646,7 @@ static inline void ext4_set_de_type(struct super_block *sb, /* readpages.c */ extern int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page); + struct readahead_control *rac, struct folio *folio); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9e8afac5c82e..17a6607bcaf2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3154,17 +3154,16 @@ out: static int ext4_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; int ret = -EAGAIN; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; - trace_ext4_readpage(page); + trace_ext4_readpage(&folio->page); if 
(ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) - return ext4_mpage_readpages(inode, NULL, page); + return ext4_mpage_readpages(inode, NULL, folio); return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c61dc8a7c014..fed4ddb652df 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -218,7 +218,7 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) } int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page) + struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -247,16 +247,15 @@ int ext4_mpage_readpages(struct inode *inode, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (rac) { - page = readahead_page(rac); - prefetchw(&page->flags); - } + if (rac) + folio = readahead_folio(rac); + prefetchw(&folio->flags); - if (page_has_buffers(page)) + if (folio_buffers(folio)) goto confused; block_in_file = next_block = - (sector_t)page->index << (PAGE_SHIFT - blkbits); + (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; @@ -290,7 +289,7 @@ int ext4_mpage_readpages(struct inode *inode, /* * Then do more ext4_map_blocks() calls until we are - * done with this page. + * done with this folio. 
*/ while (page_block < blocks_per_page) { if (block_in_file < last_block) { @@ -299,10 +298,10 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: - SetPageError(page); - zero_user_segment(page, 0, - PAGE_SIZE); - unlock_page(page); + folio_set_error(folio); + folio_zero_segment(folio, 0, + folio_size(folio)); + folio_unlock(folio); goto next_page; } } @@ -333,22 +332,22 @@ int ext4_mpage_readpages(struct inode *inode, } } if (first_hole != blocks_per_page) { - zero_user_segment(page, first_hole << blkbits, - PAGE_SIZE); + folio_zero_segment(folio, first_hole << blkbits, + folio_size(folio)); if (first_hole == 0) { - if (ext4_need_verity(inode, page->index) && - !fsverity_verify_page(page)) + if (ext4_need_verity(inode, folio->index) && + !fsverity_verify_page(&folio->page)) goto set_error_page; - SetPageUptodate(page); - unlock_page(page); - goto next_page; + folio_mark_uptodate(folio); + folio_unlock(folio); + continue; } } else if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); } /* - * This page will go to BIO. Do we need to send this + * This folio will go to BIO. Do we need to send this * BIO off first? 
*/ if (bio && (last_block_in_bio != blocks[0] - 1 || @@ -366,7 +365,7 @@ int ext4_mpage_readpages(struct inode *inode, REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); - ext4_set_bio_post_read_ctx(bio, inode, page->index); + ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) @@ -374,7 +373,7 @@ int ext4_mpage_readpages(struct inode *inode, } length = first_hole << blkbits; - if (bio_add_page(bio, page, length, 0) < length) + if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && @@ -384,19 +383,18 @@ int ext4_mpage_readpages(struct inode *inode, bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; - goto next_page; + continue; confused: if (bio) { submit_bio(bio); bio = NULL; } - if (!PageUptodate(page)) - block_read_full_folio(page_folio(page), ext4_get_block); + if (!folio_test_uptodate(folio)) + block_read_full_folio(folio, ext4_get_block); else - unlock_page(page); - next_page: - if (rac) - put_page(page); + folio_unlock(folio); +next_page: + ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); -- cgit v1.2.3 From 86b38c273cc68ce7b50649447d8ac0ddf3228026 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:24 +0000 Subject: ext4: Convert ext4_block_write_begin() to take a folio All the callers now have a folio, so pass that in and operate on folios. Removes four calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230324180129.1220691-25-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 17a6607bcaf2..b6e556c71063 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1030,12 +1030,12 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, } #ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, +static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; @@ -1045,22 +1045,24 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, int nr_wait = 0; int i; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, blocksize, 0); + head = folio_buffers(folio); + } bbits = ilog2(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; @@ -1073,19 +1075,20 @@ static int ext4_block_write_begin(struct 
page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) - zero_user_segments(page, to, block_end, - block_start, from); + folio_zero_segments(folio, to, + block_end, + block_start, from); continue; } } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } @@ -1105,14 +1108,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - page_zero_new_buffers(page, from, to); + page_zero_new_buffers(&folio->page, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; - err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), - blocksize, - bh_offset(wait[i])); + err2 = fscrypt_decrypt_pagecache_blocks(folio, + blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; @@ -1206,11 +1208,10 @@ retry_journal: #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(&folio->page, pos, len, + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(&folio->page, pos, len, - ext4_get_block); + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(&folio->page, pos, len, @@ -2938,8 +2939,7 @@ retry: folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(&folio->page, pos, len, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif -- cgit v1.2.3 From 9ea0e45bd2f6cbfba787360f5ba8e18deabb7671 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 
2023 18:01:25 +0000 Subject: ext4: Use a folio in ext4_page_mkwrite() Convert to the folio API, saving a few calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-26-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b6e556c71063..6d628d6c0847 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6075,7 +6075,7 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; @@ -6119,19 +6119,18 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) goto out_ret; } - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { - unlock_page(page); + if (folio->mapping != mapping || folio_pos(folio) > size) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time @@ -6139,17 +6138,17 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. 
*/ - if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), + if (folio_buffers(folio)) { + if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_for_stable_page(page); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } - unlock_page(page); + folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; @@ -6170,26 +6169,25 @@ retry_alloc: if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { + if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); - err = __block_write_begin(page, 0, len, ext4_get_block); + err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_journal_page_buffers(handle, page, len)) + if (ext4_journal_page_buffers(handle, &folio->page, len)) goto out_error; } else { - unlock_page(page); + folio_unlock(folio); } } ext4_journal_stop(handle); @@ -6202,7 +6200,7 @@ out: sb_end_pagefault(inode->i_sb); return ret; out_error: - unlock_page(page); + folio_unlock(folio); ext4_journal_stop(handle); goto out; } -- cgit v1.2.3 From d84c9ebdac1e39bc7b036c0c829ee8c1956edabc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:33 +0200 Subject: ext4: Mark pages with journalled data dirty Currently pages with journalled data written by write(2) or modified by block zeroing during truncate(2) are not marked as dirty. 
They are dirtied only once the transaction commits. This however makes writeback code think inode has no pages to write and so ext4_writepages() is not called to make pages with journalled data persistent. Mark pages with journalled data dirty (similarly as it happens for writes through mmap) so that writeback code knows about them and ext4_writepages() can do what it needs to do to the inode. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6d628d6c0847..c5de0c04204c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1003,6 +1003,18 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, return ret; } +/* + * Helper for handling dirtying of journalled data. We also mark the folio as + * dirty so that writeback code knows about this page (and inode) contains + * dirty data. ext4_writepages() then commits appropriate transaction to + * make data stable. 
+ */ +static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) +{ + folio_mark_dirty(bh->b_folio); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { @@ -1025,7 +1037,7 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (!ret && dirty) - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); return ret; } @@ -1272,7 +1284,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; @@ -1356,7 +1368,7 @@ static int ext4_write_end(struct file *file, /* * This is a private version of page_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need - * to call ext4_handle_dirty_metadata() instead. + * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, @@ -3740,7 +3752,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); -- cgit v1.2.3 From 265e72efa99fcc0959f8d33d346a7e0f2e3fe201 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:34 +0200 Subject: ext4: Keep pages with journalled data dirty Currently we clear page dirty bit when we checkpoint some buffers from a page with journalled data or when we perform delayed dirtying of a page in ext4_writepages(). 
In a quest to simplify handling of journalled data we want to keep page dirty as long as it has either buffers to checkpoint or journalled dirty data. So make sure to keep page dirty in ext4_writepages() if it still has journalled data attached to it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 1 - fs/ext4/page-io.c | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5de0c04204c..473226783d00 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2393,7 +2393,6 @@ static int mpage_journal_page_buffers(handle_t *handle, int len; ClearPageChecked(page); - clear_page_dirty_for_io(page); mpd->wbc->nr_to_write--; if (page->index == size >> PAGE_SHIFT && diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 8fe1875b0a42..2e5e94219693 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -479,9 +479,11 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, * to redirty the folio and keep TOWRITE tag so that * racing WB_SYNC_ALL writeback does not skip the folio. * This happens e.g. when doing writeout for - * transaction commit. + * transaction commit or when journalled data is not + * yet committed. */ - if (buffer_dirty(bh)) { + if (buffer_dirty(bh) || + (buffer_jbd(bh) && buffer_jbddirty(bh))) { if (!folio_test_dirty(folio)) folio_redirty_for_writepage(wbc, folio); keep_towrite = true; -- cgit v1.2.3 From 5e1bdea6391d09fde424a1406a04e01b208a04d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:35 +0200 Subject: ext4: Clear dirty bit from pages without data to write With journalled data it can happen that checkpointing code will write out page contents without clearing the page dirty bit. The logic in ext4_page_nomap_can_writeout() then results in us never calling mpage_submit_page() and thus clearing the dirty bit. 
Drop the optimization with ext4_page_nomap_can_writeout() and just always call to mpage_submit_page(). ext4_bio_write_page() knows when to redirty the page and the additional clearing & setting of page dirty bit for ordered mode writeout is not that expensive to jump through the hoops for it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 473226783d00..bca86f8f1594 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2348,19 +2348,6 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -/* Return true if the folio needs to be written as part of transaction commit */ -static bool ext4_folio_nomap_can_writeout(struct folio *folio) -{ - struct buffer_head *bh, *head; - - bh = head = folio_buffers(folio); - do { - if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) - return true; - } while ((bh = bh->b_this_page) != head); - return false; -} - static int ext4_journal_page_buffers(handle_t *handle, struct page *page, int len) { @@ -2545,13 +2532,11 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * range operations before discarding the page cache. */ if (!mpd->can_map) { - if (ext4_folio_nomap_can_writeout(folio)) { - WARN_ON_ONCE(sb->s_writers.frozen == - SB_FREEZE_COMPLETE); - err = mpage_submit_folio(mpd, folio); - if (err < 0) - goto out; - } + WARN_ON_ONCE(sb->s_writers.frozen == + SB_FREEZE_COMPLETE); + err = mpage_submit_folio(mpd, folio); + if (err < 0) + goto out; /* Pending dirtying of journalled data? 
*/ if (folio_test_checked(folio)) { WARN_ON_ONCE(sb->s_writers.frozen >= -- cgit v1.2.3 From 1f1a55f0bf069799edd5f21a99ac1e3b10ebafee Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:36 +0200 Subject: ext4: Commit transaction before writing back pages in data=journal mode When journalling data we currently just walk over pages, journal those that are marked for delayed dirtying (only pinned pages dirtied behind our back these days) and checkpoint other dirty pages. Because some pages may be part of running transaction the result is that after filemap_write_and_wait() we are not guaranteed pages are stable on disk. Thus places that want to flush current pagecache content need to jump through hoops to make sure journalled data is not lost. This is manageable in cases completely controlled by ext4 (such as extent shifting operations or inode eviction) but it gets ugly for stuff like fsverity. Furthermore it is rather error prone as people often do not realize journalled data needs special handling. So change ext4_writepages() to commit transaction with inode's data before going through the writeback loop in WB_SYNC_ALL mode. As a result filemap_write_and_wait() is now really getting pages to stable storage and makes pagecache pages safe to reclaim. Consequently we can remove the special handling of journalled data from several places in follow up patches. Note that this will make fsync(2) for journalled data more expensive as we will end up not only committing the transaction we need but also checkpointing the data (which we may have previously skipped if the data was part of the running transaction). If we really cared, we would need to introduce special VFS function for writing out & invalidating page cache for a range, use ->launder_page callback to perform checkpointing, and use it from all the places that need this functionality. But at this point I'm not convinced the complexity is worth it. 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bca86f8f1594..fb1b729ed1f8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1568,6 +1568,7 @@ struct mpage_da_data { struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; + unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -2545,6 +2546,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd, &folio->page); if (err < 0) goto out; + mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { @@ -2634,10 +2636,23 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) /* * data=journal mode does not do delalloc so we just need to writeout / - * journal already mapped buffers + * journal already mapped buffers. On the other hand we need to commit + * transaction to make data stable. We expect all the data to be + * already in the journal (the only exception are DMA pinned pages + * dirtied behind our back) so we commit transaction here and run the + * writeback loop to checkpoint them. The checkpointing is not actually + * necessary to make data persistent *but* quite a few places (extent + * shifting operations, fsverity, ...) depend on being able to drop + * pagecache pages after calling filemap_write_and_wait() and for that + * checkpointing needs to happen. 
*/ - if (ext4_should_journal_data(inode)) + if (ext4_should_journal_data(inode)) { mpd->can_map = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + ext4_fc_commit(sbi->s_journal, + EXT4_I(inode)->i_datasync_tid); + } + mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { /* @@ -2818,6 +2833,13 @@ static int ext4_writepages(struct address_space *mapping, percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); ret = ext4_do_writepages(&mpd); + /* + * For data=journal writeback we could have come across pages marked + * for delayed dirtying (PageChecked) which were just added to the + * running transaction. Try once more to get them to stable storage. + */ + if (!ret && mpd.journalled_more_data) + ret = ext4_do_writepages(&mpd); percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); return ret; -- cgit v1.2.3 From 56c2a0e3d90d3822fab157883957523e327bc9ae Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:40 +0200 Subject: ext4: Drop special handling of journalled data from ext4_evict_inode() Now that ext4_writepages() makes sure journalled data is on stable storage, write_inode_now() call in iput_final() is enough to make pagecache pages with journalled data really clean (data committed and checkpointed). So we can drop special handling of journalled data in ext4_evict_inode(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fb1b729ed1f8..bb8ac3e0f784 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -179,33 +179,6 @@ void ext4_evict_inode(struct inode *inode) if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { - /* - * When journalling data dirty buffers are tracked only in the - * journal. 
So although mm thinks everything is clean and - * ready for reaping the inode might still have some pages to - * write in the running transaction or waiting to be - * checkpointed. Thus calling jbd2_journal_invalidate_folio() - * (via truncate_inode_pages()) to discard these buffers can - * cause data loss. Also even if we did not discard these - * buffers, we would have no way to find them after the inode - * is reaped and thus user could see stale data if he tries to - * read them before the transaction is checkpointed. So be - * careful and force everything to disk here... We use - * ei->i_datasync_tid to store the newest transaction - * containing inode's data. - * - * Note that directories do not have this problem because they - * don't use page cache. - */ - if (inode->i_ino != EXT4_JOURNAL_INO && - ext4_should_journal_data(inode) && - S_ISREG(inode->i_mode) && inode->i_data.nrpages) { - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - - jbd2_complete_transaction(journal, commit_tid); - filemap_write_and_wait(&inode->i_data); - } truncate_inode_pages_final(&inode->i_data); goto no_delete; -- cgit v1.2.3 From 951cafa6b80e55b966047b0c9cc5564df8b92145 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:42 +0200 Subject: ext4: Simplify handling of journalled data in ext4_bmap() Now that ext4_writepages() gets journalled data into its final location we just use filemap_write_and_wait() instead of special handling of journalled data in ext4_bmap(). We can also drop EXT4_STATE_JDATA flag as it is not used anymore. 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-11-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/inode.c | 44 +++++--------------------------------------- 2 files changed, 5 insertions(+), 40 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 83f0cc02250f..271d9661ce82 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1886,7 +1886,6 @@ static inline void ext4_simulate_fail_bh(struct super_block *sb, * Inode dynamic state flags */ enum { - EXT4_STATE_JDATA, /* journaled data exists */ EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bb8ac3e0f784..d43beb886a30 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1414,7 +1414,6 @@ static int ext4_journalled_write_end(struct file *file, } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); - ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); @@ -2340,8 +2339,6 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - ext4_set_inode_state(inode, EXT4_STATE_JDATA); - return ret; } @@ -3085,9 +3082,7 @@ int ext4_alloc_da_blocks(struct inode *inode) static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - journal_t *journal; sector_t ret = 0; - int err; inode_lock_shared(inode); /* @@ -3097,45 +3092,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - test_opt(inode->i_sb, DELALLOC)) { + (test_opt(inode->i_sb, DELALLOC) || + ext4_should_journal_data(inode))) { /* - * With delalloc we want to sync the file - * so that we can 
make sure we allocate - * blocks for file + * With delalloc or journalled data we want to sync the file so + * that we can make sure we allocate blocks for file and data + * is in place for the user to see it */ filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && - ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { - /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT4_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. - */ - - ext4_clear_inode_state(inode, EXT4_STATE_JDATA); - journal = EXT4_JOURNAL(inode); - jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal, 0); - jbd2_journal_unlock_updates(journal); - - if (err) - goto out; - } - ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: -- cgit v1.2.3 From ab382539adcb43f52d984abf58d8e3459cd707a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:43 +0200 Subject: ext4: Update comment in mpage_prepare_extent_to_map() Since filemap_write_and_wait() is now enough to get journalled data to final location update the comment in mpage_prepare_extent_to_map(). 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-12-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d43beb886a30..c1888cd2a48f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2496,11 +2496,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This - * is crutial so that forcing a transaction commit and - * then calling filemap_write_and_wait() guarantees - * current state of data is in its final location. Such - * sequence is used for example by insert/collapse - * range operations before discarding the page cache. + * makes sure current data is checkpointed to the final + * location before possibly journalling it again which + * is desirable when the page is frequently dirtied + * through a pin. */ if (!mpd->can_map) { WARN_ON_ONCE(sb->s_writers.frozen == -- cgit v1.2.3 From d0ab8368c175f7b5ef0851283a2ba362a9ab327a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:44 +0200 Subject: Revert "ext4: Fix warnings when freezing filesystem with journaled data" After making ext4_writepages() properly clean all pages there is no need for special treatment of filesystem freezing. Revert commit e6c28a26b799c7640b77daff3e4a67808c74381c. 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-13-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 15 +-------------- fs/ext4/super.c | 11 ----------- 2 files changed, 1 insertion(+), 25 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c1888cd2a48f..8dbd352e3986 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2385,7 +2385,6 @@ static int mpage_journal_page_buffers(handle_t *handle, static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct super_block *sb = mpd->inode->i_sb; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; @@ -2405,15 +2404,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->next_page = index; - /* - * Start a transaction for writeback of journalled data. We don't start - * the transaction if the filesystem is frozen. In that case we - * should not have any dirty data to write anymore but possibly there - * are stray page dirty bits left by the checkpointing code so this - * loop clears them. - */ - if (ext4_should_journal_data(mpd->inode) && - sb->s_writers.frozen < SB_FREEZE_FS) { + if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) @@ -2502,15 +2493,11 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * through a pin. */ if (!mpd->can_map) { - WARN_ON_ONCE(sb->s_writers.frozen == - SB_FREEZE_COMPLETE); err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? 
*/ if (folio_test_checked(folio)) { - WARN_ON_ONCE(sb->s_writers.frozen >= - SB_FREEZE_FS); err = mpage_journal_page_buffers(handle, mpd, &folio->page); if (err < 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a876ebd534a8..690faf766d23 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6293,17 +6293,6 @@ static int ext4_freeze(struct super_block *sb) if (error < 0) goto out; - /* - * Do another sync. We really should not have any dirty data - * anymore but our checkpointing code does not clear page dirty - * bits due to locking constraints so writeback still can get - * started for inodes with journalled data which triggers - * annoying warnings. - */ - error = sync_filesystem(sb); - if (error < 0) - goto out; - /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) -- cgit v1.2.3 From 1dedde690303c05ef732b7c5c8356fdf60a4ade3 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 21 Mar 2023 09:37:21 +0800 Subject: ext4: fix i_disksize exceeding i_size problem in paritally written case It is possible for i_disksize to exceed i_size, triggering a warning. 
generic_perform_write copied = iov_iter_copy_from_user_atomic(len) // copied < len ext4_da_write_end | ext4_update_i_disksize | new_i_size = pos + copied; | WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize) // update i_disksize | generic_write_end | copied = block_write_end(copied, len) // copied = 0 | if (unlikely(copied < len)) | if (!PageUptodate(page)) | copied = 0; | if (pos + copied > inode->i_size) // return false if (unlikely(copied == 0)) goto again; if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } We get i_disksize greater than i_size here, which could trigger WARNING check 'i_size_read(inode) < EXT4_I(inode)->i_disksize' while doing dio: ext4_dio_write_iter iomap_dio_rw __iomap_dio_rw // return err, length is not aligned to 512 ext4_handle_inode_extension WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize) // Oops WARNING: CPU: 2 PID: 2609 at fs/ext4/file.c:319 CPU: 2 PID: 2609 Comm: aa Not tainted 6.3.0-rc2 RIP: 0010:ext4_file_write_iter+0xbc7 Call Trace: vfs_write+0x3b1 ksys_write+0x77 do_syscall_64+0x39 Fix it by updating 'copied' value before updating i_disksize just like ext4_write_inline_data_end() does. A reproducer can be found in the buganizer link below. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217209 Fixes: 64769240bd07 ("ext4: Add delayed allocation support in data=writeback mode") Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230321013721.89818-1-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8dbd352e3986..1cfbb929c694 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2982,6 +2982,9 @@ static int ext4_da_write_end(struct file *file, ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); + if (unlikely(copied < len) && !PageUptodate(page)) + copied = 0; + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; -- cgit v1.2.3 From 00d873c17e29cc32d90ca852b82685f1673acaa5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 May 2023 14:47:23 +0200 Subject: ext4: avoid deadlock in fs reclaim with page writeback Ext4 has a filesystem wide lock protecting ext4_writepages() calls to avoid races with switching of journalled data flag or inode format. This lock can however cause a deadlock like: CPU0 CPU1 ext4_writepages() percpu_down_read(sbi->s_writepages_rwsem); ext4_change_inode_journal_flag() percpu_down_write(sbi->s_writepages_rwsem); - blocks, all readers block from now on ext4_do_writepages() ext4_init_io_end() kmem_cache_zalloc(io_end_cachep, GFP_KERNEL) fs_reclaim frees dentry... dentry_unlink_inode() iput() - last ref => iput_final() - inode dirty => write_inode_now()... ext4_writepages() tries to acquire sbi->s_writepages_rwsem and blocks forever Make sure we cannot recurse into filesystem reclaim from writeback code to avoid the deadlock. 
Reported-by: syzbot+6898da502aef574c5f8a@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000004c66b405fa108e27@google.com Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230504124723.20205-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 24 ++++++++++++++++++++++++ fs/ext4/inode.c | 18 ++++++++++-------- fs/ext4/migrate.c | 11 ++++++----- 3 files changed, 40 insertions(+), 13 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7e8f66ba17f4..6948d673bba2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1684,6 +1684,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } +static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0d5ba922e411..3cb774d9e3f1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2783,11 +2783,12 @@ static int ext4_writepages(struct address_space *mapping, .can_map = 1, }; int ret; + int alloc_ctx; if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return -EIO; - 
percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked @@ -2796,7 +2797,7 @@ static int ext4_writepages(struct address_space *mapping, */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); - percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); + ext4_writepages_up_read(sb, alloc_ctx); return ret; } @@ -2824,17 +2825,18 @@ static int ext4_dax_writepages(struct address_space *mapping, long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + int alloc_ctx; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - percpu_down_read(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); + ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } @@ -5928,7 +5930,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int alloc_ctx; /* * We have to be very careful here: changing a data block's @@ -5966,7 +5968,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } } - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* @@ -5983,7 +5985,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ 
-5991,7 +5993,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a19a9661646e..d98ac2af8199 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -408,7 +408,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) int ext4_ext_migrate(struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; int retval = 0, i; __le32 *i_data; @@ -418,6 +417,7 @@ int ext4_ext_migrate(struct inode *inode) unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; + int alloc_ctx; /* * If the filesystem does not support extents, or the inode @@ -434,7 +434,7 @@ int ext4_ext_migrate(struct inode *inode) */ return retval; - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); /* * Worst case we can touch the allocation bitmaps and a block @@ -586,7 +586,7 @@ out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; } @@ -605,6 +605,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; + int alloc_ctx; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -621,7 +622,7 @@ int ext4_ind_migrate(struct inode *inode) if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { @@ -665,6 +666,6 @@ errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); out_unlock: - 
percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; } -- cgit v1.2.3 From fa83c34e3e56b3c672af38059e066242655271b1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 5 May 2023 21:24:29 +0800 Subject: ext4: check iomap type only if ext4_iomap_begin() does not fail When ext4_iomap_overwrite_begin() calls ext4_iomap_begin(), mapping blocks may fail for some reason (e.g. memory allocation failure, bare disk write), and the subsequent check "iomap->type != IOMAP_MAPPED" then triggers WARN_ON(). When ext4_iomap_begin() returns an error, it is normal that iomap->type may not match the expectation. Therefore, we only check whether iomap->type is as expected when ext4_iomap_begin() has executed successfully. Cc: stable@kernel.org Reported-by: syzbot+08106c4b7d60702dbc14@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/00000000000015760b05f9b4eee9@google.com Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230505132429.714648-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3cb774d9e3f1..ce5f21b6c2b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3377,7 +3377,7 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); - WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } -- cgit v1.2.3 From b3e6bcb94590dea45396b9481e47b809b1be4afa Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 May 2023 23:49:48 -0400 Subject: ext4: add EA_INODE checking to ext4_iget() Add a new flag, EXT4_IGET_EA_INODE which indicates whether the inode is expected to have the EA_INODE flag or not.
If the flag is not set/clear as expected, then fail the iget() operation and mark the file system as corrupted. This commit also makes ext4_iget() always perform the is_bad_inode() check even when the inode is already in the inode cache. This allows us to remove the is_bad_inode() check from the callers of ext4_iget() in the ea_inode code. Reported-by: syzbot+cbb68193bdb95af4340a@syzkaller.appspotmail.com Reported-by: syzbot+62120febbd1ee3c3c860@syzkaller.appspotmail.com Reported-by: syzbot+edce54daffee36421b4c@syzkaller.appspotmail.com Cc: stable@kernel.org Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230524034951.779531-2-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/inode.c | 31 ++++++++++++++++++++++++++----- fs/ext4/xattr.c | 36 +++++++----------------------------- 3 files changed, 35 insertions(+), 35 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6948d673bba2..9525c52b78dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2901,7 +2901,8 @@ typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ - EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ + EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ + EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce5f21b6c2b3..258f3cbed347 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4641,6 +4641,21 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } +static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) + +{ + if (flags & EXT4_IGET_EA_INODE) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return "missing EA_INODE flag"; + } else { + if
((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return "unexpected EA_INODE flag"; + } + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) + return "unexpected bad inode w/o EXT4_IGET_BAD"; + return NULL; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -4650,6 +4665,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; + const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4677,8 +4693,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->i_state & I_NEW)) { + if ((err_str = check_igot_inode(inode, flags)) != NULL) { + ext4_error_inode(inode, function, line, 0, err_str); + iput(inode); + return ERR_PTR(-EFSCORRUPTED); + } return inode; + } ei = EXT4_I(inode); iloc.bh = NULL; @@ -4944,10 +4966,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { - ext4_error_inode(inode, function, line, 0, - "bad inode without EXT4_IGET_BAD flag"); - ret = -EUCLEAN; + if ((err_str = check_igot_inode(inode, flags)) != NULL) { + ext4_error_inode(inode, function, line, 0, err_str); + ret = -EFSCORRUPTED; goto bad_inode; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index dfc2e223bd10..a27208129a80 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -433,7 +433,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, return -EFSCORRUPTED; } - inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); + inode = ext4_iget(parent->i_sb, ea_ino, 
EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, @@ -441,23 +441,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, err); return err; } - - if (is_bad_inode(inode)) { - ext4_error(parent->i_sb, - "error while reading EA inode %lu is_bad_inode", - ea_ino); - err = -EIO; - goto error; - } - - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { - ext4_error(parent->i_sb, - "EA inode %lu does not have EXT4_EA_INODE_FL flag", - ea_ino); - err = -EINVAL; - goto error; - } - ext4_xattr_inode_set_class(inode); /* @@ -478,9 +461,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, *ea_inode = inode; return 0; -error: - iput(inode); - return err; } /* Remove entry from mbcache when EA inode is getting evicted */ @@ -1556,11 +1536,10 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, - EXT4_IGET_NORMAL); - if (!IS_ERR(ea_inode) && - !is_bad_inode(ea_inode) && - (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && - i_size_read(ea_inode) == value_len && + EXT4_IGET_EA_INODE); + if (IS_ERR(ea_inode)) + goto next_entry; + if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && @@ -1570,9 +1549,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, kvfree(ea_data); return ea_inode; } - - if (!IS_ERR(ea_inode)) - iput(ea_inode); + iput(ea_inode); + next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); -- cgit v1.2.3 From 2bc7e7c1a3bc9bd0cbf0f71006f6fe7ef24a00c2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 May 2023 23:49:50 -0400 Subject: ext4: disallow ea_inodes with extended attributes An ea_inode stores the value of an extended attribute; it can not have extended attributes itself, or this will cause recursive nightmares. 
Add a check in ext4_iget() to make sure this is the case. Cc: stable@kernel.org Reported-by: syzbot+e44749b6ba4d0434cd47@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230524034951.779531-4-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 258f3cbed347..02de439bf1f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4647,6 +4647,9 @@ static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "missing EA_INODE flag"; + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + EXT4_I(inode)->i_file_acl) + return "ea_inode with extended attributes"; } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "unexpected EA_INODE flag"; -- cgit v1.2.3