From f6a6c280059c4ddc23e12e3de1b01098e240036f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 26 Aug 2025 11:24:38 -0700 Subject: btrfs: fix subvolume deletion lockup caused by inodes xarray race There is a race condition between inode eviction and inode caching that can cause a live struct btrfs_inode to be missing from the root->inodes xarray. Specifically, there is a window during evict() between the inode being unhashed and deleted from the xarray. If btrfs_iget() is called for the same inode in that window, it will be recreated and inserted into the xarray, but then eviction will delete the new entry, leaving nothing in the xarray: Thread 1 Thread 2 --------------------------------------------------------------- evict() remove_inode_hash() btrfs_iget_path() btrfs_iget_locked() btrfs_read_locked_inode() btrfs_add_inode_to_root() destroy_inode() btrfs_destroy_inode() btrfs_del_inode_from_root() __xa_erase In turn, this can cause issues for subvolume deletion. Specifically, if an inode is in this lost state, and all other inodes are evicted, then btrfs_del_inode_from_root() will call btrfs_add_dead_root() prematurely. If the lost inode has a delayed_node attached to it, then when btrfs_clean_one_deleted_snapshot() calls btrfs_kill_all_delayed_nodes(), it will loop forever because the delayed_nodes xarray will never become empty (unless memory pressure forces the inode out). We saw this manifest as soft lockups in production. Fix it by only deleting the xarray entry if it matches the given inode (using __xa_cmpxchg()). Fixes: 310b2f5d5a94 ("btrfs: use an xarray to track open inodes in a root") Cc: stable@vger.kernel.org # 6.11+ Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Co-authored-by: Leo Martins Signed-off-by: Leo Martins Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd82dcc7b2b7..e7218e78bff4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5696,7 +5696,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode) bool empty = false; xa_lock(&root->inodes); - entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + /* + * This btrfs_inode is being freed and has already been unhashed at this + * point. It's possible that another btrfs_inode has already been + * allocated for the same inode and inserted itself into the root, so + * don't delete it in that case. + * + * Note that this shouldn't need to allocate memory, so the gfp flags + * don't really matter. + */ + entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, + GFP_ATOMIC); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); -- cgit v1.2.3 From 8679d2687c351824d08cf1f0e86f3b65f22a00fe Mon Sep 17 00:00:00 2001 From: austinchang Date: Thu, 11 Sep 2025 06:06:29 +0000 Subject: btrfs: initialize inode::file_extent_tree after i_mode has been set btrfs_init_file_extent_tree() uses S_ISREG() to determine if the file is a regular file. In the beginning of btrfs_read_locked_inode(), the i_mode hasn't been read from inode item, then file_extent_tree won't be used at all in volumes without NO_HOLES. Fix this by calling btrfs_init_file_extent_tree() after i_mode is initialized in btrfs_read_locked_inode(). Fixes: 3d7db6e8bd22e6 ("btrfs: don't allocate file extent tree for non regular files") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Filipe Manana Signed-off-by: austinchang Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 --- fs/btrfs/inode.c | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0f8d8e275143..c0c1ddd46b67 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; @@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e7218e78bff4..18db1053cdf0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3885,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(inode); - if (ret) - goto out; - ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3920,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); @@ -3953,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_set_inode_mapping_order(inode); cache_index: + ret = btrfs_init_file_extent_tree(inode); + if (ret) + goto out; + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any -- cgit v1.2.3