From 23e3d7f7061f8682c751c46512718f47580ad8f0 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 17 Mar 2022 22:21:37 +0800 Subject: jbd2: fix a potential race while discarding reserved buffers after an abort we got issue as follows: [ 72.796117] EXT4-fs error (device sda): ext4_journal_check_start:83: comm fallocate: Detected aborted journal [ 72.826847] EXT4-fs (sda): Remounting filesystem read-only fallocate: fallocate failed: Read-only file system [ 74.791830] jbd2_journal_commit_transaction: jh=0xffff9cfefe725d90 bh=0x0000000000000000 end delay [ 74.793597] ------------[ cut here ]------------ [ 74.794203] kernel BUG at fs/jbd2/transaction.c:2063! [ 74.794886] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 74.795533] CPU: 4 PID: 2260 Comm: jbd2/sda-8 Not tainted 5.17.0-rc8-next-20220315-dirty #150 [ 74.798327] RIP: 0010:__jbd2_journal_unfile_buffer+0x3e/0x60 [ 74.801971] RSP: 0018:ffffa828c24a3cb8 EFLAGS: 00010202 [ 74.802694] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 74.803601] RDX: 0000000000000001 RSI: ffff9cfefe725d90 RDI: ffff9cfefe725d90 [ 74.804554] RBP: ffff9cfefe725d90 R08: 0000000000000000 R09: ffffa828c24a3b20 [ 74.805471] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9cfefe725d90 [ 74.806385] R13: ffff9cfefe725d98 R14: 0000000000000000 R15: ffff9cfe833a4d00 [ 74.807301] FS: 0000000000000000(0000) GS:ffff9d01afb00000(0000) knlGS:0000000000000000 [ 74.808338] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 74.809084] CR2: 00007f2b81bf4000 CR3: 0000000100056000 CR4: 00000000000006e0 [ 74.810047] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 74.810981] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 74.811897] Call Trace: [ 74.812241] [ 74.812566] __jbd2_journal_refile_buffer+0x12f/0x180 [ 74.813246] jbd2_journal_refile_buffer+0x4c/0xa0 [ 74.813869] jbd2_journal_commit_transaction.cold+0xa1/0x148 [ 74.817550] kjournald2+0xf8/0x3e0 [ 74.819056] kthread+0x153/0x1c0 [ 74.819963] ret_from_fork+0x22/0x30 Above issue may happen as follows: write truncate kjournald2 generic_perform_write ext4_write_begin ext4_walk_page_buffers do_journal_get_write_access ->add BJ_Reserved list ext4_journalled_write_end ext4_walk_page_buffers write_end_fn ext4_handle_dirty_metadata ***************JBD2 ABORT************** jbd2_journal_dirty_metadata -> return -EROFS, jh in reserved_list jbd2_journal_commit_transaction while (commit_transaction->t_reserved_list) jh = commit_transaction->t_reserved_list; truncate_pagecache_range do_invalidatepage ext4_journalled_invalidatepage jbd2_journal_invalidatepage journal_unmap_buffer __dispose_buffer __jbd2_journal_unfile_buffer jbd2_journal_put_journal_head ->put last ref_count __journal_remove_journal_head bh->b_private = NULL; jh->b_bh = NULL; jbd2_journal_refile_buffer(journal, jh); bh = jh2bh(jh); ->bh is NULL, later will trigger null-ptr-deref journal_free_journal_head(jh); After commit 96f1e0974575, we no longer hold the j_state_lock while iterating over the list of reserved handles in jbd2_journal_commit_transaction(). This potentially allows the journal_head to be freed by journal_unmap_buffer while the commit codepath is also trying to free the BJ_Reserved buffers. Keeping j_state_lock held while trying extends hold time of the lock minimally, and solves this issue. Fixes: 96f1e0974575("jbd2: avoid long hold times of j_state_lock while committing a transaction") Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220317142137.1821590-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5b9408e3b370..ac7f067b7bdd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -488,7 +488,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_wait_updates(journal); commit_transaction->t_state = T_SWITCH; - write_unlock(&journal->j_state_lock); J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); @@ -508,6 +507,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) * has reserved. This is consistent with the existing behaviour * that multiple jbd2_journal_get_write_access() calls to the same * buffer are perfectly permissible. + * We use journal->j_state_lock here to serialize processing of + * t_reserved_list with eviction of buffers from journal_unmap_buffer(). */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; @@ -527,6 +528,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_refile_buffer(journal, jh); } + write_unlock(&journal->j_state_lock); /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially -- cgit v1.2.3 From 731222557a69003bb27280b0750183803fa79770 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 00:52:35 -0400 Subject: jbd2: Convert release_buffer_page() to use a folio Saves a few calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Theodore Ts'o Reviewed-by: Jeff Layton --- fs/jbd2/commit.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ac7f067b7bdd..2f37108da0ec 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -62,6 +62,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) */ static void release_buffer_page(struct buffer_head *bh) { + struct folio *folio; struct page *page; if (buffer_dirty(bh)) @@ -71,18 +72,19 @@ static void release_buffer_page(struct buffer_head *bh) page = bh->b_page; if (!page) goto nope; - if (page->mapping) + folio = page_folio(page); + if (folio->mapping) goto nope; /* OK, it's a truncated page */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto nope; - get_page(page); + folio_get(folio); __brelse(bh); - try_to_free_buffers(page); - unlock_page(page); - put_page(page); + try_to_free_buffers(&folio->page); + folio_unlock(folio); + folio_put(folio); return; nope: -- cgit v1.2.3 From 68189fef88c7d02eb92e038be3d6428ebd0d2945 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 01:08:08 -0400 Subject: fs: Change try_to_free_buffers() to take a folio All but two of the callers already have a folio; pass a folio into try_to_free_buffers(). This removes the last user of cancel_dirty_page() so remove that wrapper function too. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton --- fs/buffer.c | 42 +++++++++++++++++++++--------------------- fs/ext4/inode.c | 2 +- fs/gfs2/aops.c | 2 +- fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/jbd2/commit.c | 2 +- fs/jbd2/transaction.c | 4 ++-- fs/mpage.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/reiserfs/inode.c | 2 +- fs/reiserfs/journal.c | 2 +- include/linux/buffer_head.h | 4 ++-- include/linux/pagemap.h | 4 ---- mm/filemap.c | 2 +- mm/migrate.c | 2 +- mm/vmscan.c | 2 +- 16 files changed, 37 insertions(+), 41 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/buffer.c b/fs/buffer.c index 786ef5b98c80..701af0035802 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -955,7 +955,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, size); goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page_folio(page))) goto failed; } @@ -3155,20 +3155,20 @@ int sync_dirty_buffer(struct buffer_head *bh) EXPORT_SYMBOL(sync_dirty_buffer); /* - * try_to_free_buffers() checks if all the buffers on this particular page + * try_to_free_buffers() checks if all the buffers on this particular folio * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the page or by holding its mapping's private_lock. + * locking the folio or by holding its mapping's private_lock. * - * If the page is dirty but all the buffers are clean then we need to - * be sure to mark the page clean as well. This is because the page + * If the folio is dirty but all the buffers are clean then we need to + * be sure to mark the folio clean as well. This is because the folio * may be against a block device, and a later reattachment of buffers - * to a dirty page will set *all* buffers dirty. Which would corrupt + * to a dirty folio will set *all* buffers dirty. Which would corrupt * filesystem data on the same device. * - * The same applies to regular filesystem pages: if all the buffers are - * clean then we set the page clean and proceed. To do that, we require + * The same applies to regular filesystem folios: if all the buffers are + * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with * private_lock. * @@ -3207,40 +3207,40 @@ failed: return 0; } -int try_to_free_buffers(struct page *page) +bool try_to_free_buffers(struct folio *folio) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = folio->mapping; struct buffer_head *buffers_to_free = NULL; - int ret = 0; + bool ret = 0; - BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) - return 0; + BUG_ON(!folio_test_locked(folio)); + if (folio_test_writeback(folio)) + return false; if (mapping == NULL) { /* can this still happen? */ - ret = drop_buffers(page, &buffers_to_free); + ret = drop_buffers(&folio->page, &buffers_to_free); goto out; } spin_lock(&mapping->private_lock); - ret = drop_buffers(page, &buffers_to_free); + ret = drop_buffers(&folio->page, &buffers_to_free); /* * If the filesystem writes its buffers by hand (eg ext3) - * then we can have clean buffers against a dirty page. We - * clean the page here; otherwise the VM will never notice + * then we can have clean buffers against a dirty folio. We + * clean the folio here; otherwise the VM will never notice * that the filesystem did any IO at all. * * Also, during truncate, discard_buffer will have marked all - * the page's buffers clean. We discover that here and clean - * the page also. + * the folio's buffers clean. We discover that here and clean + * the folio also. * * private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) - cancel_dirty_page(page); + folio_cancel_dirty(folio); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 943937cb5302..987ea77e672d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3255,7 +3255,7 @@ static bool ext4_release_folio(struct folio *folio, gfp_t wait) if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else - return try_to_free_buffers(&folio->page); + return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 95a674d70c04..106e90a36583 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -757,7 +757,7 @@ bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask) } while (bh != head); gfs2_log_unlock(sdp); - return try_to_free_buffers(&folio->page); + return try_to_free_buffers(folio); cannot_release: gfs2_log_unlock(sdp); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 86fd50e5fccb..c4526f16355d 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -124,7 +124,7 @@ static bool hfs_release_folio(struct folio *folio, gfp_t mask) } while (--i && nidx < tree->node_count); spin_unlock(&tree->hash_lock); } - return res ? try_to_free_buffers(&folio->page) : false; + return res ? try_to_free_buffers(folio) : false; } static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f723e0e91d51..aeab83ed1c9c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -121,7 +121,7 @@ static bool hfsplus_release_folio(struct folio *folio, gfp_t mask) } while (--i && nidx < tree->node_count); spin_unlock(&tree->hash_lock); } - return res ? try_to_free_buffers(&folio->page) : false; + return res ? try_to_free_buffers(folio) : false; } static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2f37108da0ec..eb315e81f1a6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -82,7 +82,7 @@ static void release_buffer_page(struct buffer_head *bh) folio_get(folio); __brelse(bh); - try_to_free_buffers(&folio->page); + try_to_free_buffers(folio); folio_unlock(folio); folio_put(folio); return; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ee33d277d51e..e49bb0938376 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2175,7 +2175,7 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) goto busy; } while ((bh = bh->b_this_page) != head); - ret = try_to_free_buffers(&folio->page); + ret = try_to_free_buffers(folio); busy: return ret; } @@ -2482,7 +2482,7 @@ int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio, } while (bh != head); if (!partial_page) { - if (may_free && try_to_free_buffers(&folio->page)) + if (may_free && try_to_free_buffers(folio)) J_ASSERT(!folio_buffers(folio)); } return 0; diff --git a/fs/mpage.c b/fs/mpage.c index 6df9c3aa5728..0d25f44f5707 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -431,7 +431,7 @@ static void clean_buffers(struct page *page, unsigned first_unmapped) * disk before we reach the platter. */ if (buffer_heads_over_limit && PageUptodate(page)) - try_to_free_buffers(page); + try_to_free_buffers(page_folio(page)); } /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7d7b86ca078f..35d40a67204c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -502,7 +502,7 @@ static bool ocfs2_release_folio(struct folio *folio, gfp_t wait) { if (!folio_buffers(folio)) return false; - return try_to_free_buffers(&folio->page); + return try_to_free_buffers(folio); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9cf2e1420a74..0cffe054b78e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3234,7 +3234,7 @@ static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) bh = bh->b_this_page; } while (bh != head); if (ret) - ret = try_to_free_buffers(&folio->page); + ret = try_to_free_buffers(folio); spin_unlock(&j->j_dirty_buffers_lock); return ret; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 99ba495b0f28..d8cc9a366124 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -606,7 +606,7 @@ static void release_buffer_page(struct buffer_head *bh) folio_get(folio); put_bh(bh); if (!folio->mapping) - try_to_free_buffers(&folio->page); + try_to_free_buffers(folio); folio_unlock(folio); folio_put(folio); } else { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 31d82fd9abe8..c9d1463bb20f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -158,7 +158,7 @@ void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); -int try_to_free_buffers(struct page *); +bool try_to_free_buffers(struct folio *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, @@ -402,7 +402,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio); #else /* CONFIG_BLOCK */ static inline void buffer_init(void) {} -static inline int try_to_free_buffers(struct page *page) { return 1; } +static inline bool try_to_free_buffers(struct folio *folio) { return true; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 831b28dab01a..82dfb279e0c4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1067,10 +1067,6 @@ static inline void folio_cancel_dirty(struct folio *folio) if (folio_test_dirty(folio)) __folio_cancel_dirty(folio); } -static inline void cancel_dirty_page(struct page *page) -{ - folio_cancel_dirty(page_folio(page)); -} bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); void folio_invalidate(struct folio *folio, size_t offset, size_t length); diff --git a/mm/filemap.c b/mm/filemap.c index ee892853a214..d335a154a0d9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3957,6 +3957,6 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); - return try_to_free_buffers(&folio->page); + return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); diff --git a/mm/migrate.c b/mm/migrate.c index 6c31ee1e1c9b..21d82636c291 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1013,7 +1013,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (!page->mapping) { VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { - try_to_free_buffers(page); + try_to_free_buffers(folio); goto out_unlock_both; } } else if (page_mapped(page)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 27851232e00c..f3f7ce2c4068 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1181,7 +1181,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping) * folio->mapping == NULL while being dirty with clean buffers. */ if (folio_test_private(folio)) { - if (try_to_free_buffers(&folio->page)) { + if (try_to_free_buffers(folio)) { folio_clear_dirty(folio); pr_info("%s: orphaned folio\n", __func__); return PAGE_CLEAN; -- cgit v1.2.3