From d46947284496e5dd1d5f6850790ec623a482b63a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Feb 2023 11:57:20 -0500 Subject: btrfs: replace BUG_ON with ASSERT in btrfs_read_node_slot In btrfs_read_node_slot() we have a BUG_ON() that can be converted to an ASSERT(), it's from an extent buffer and the level is validated at the time it's read from disk. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a5b6bb54545f..a1c109d7956a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -959,7 +959,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); - BUG_ON(level == 0); + ASSERT(level); check.level = level - 1; check.transid = btrfs_node_ptr_generation(parent, slot); -- cgit v1.2.3 From 9cf14029d5fb42126b332aea708b787be9a5079e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Feb 2023 11:57:21 -0500 Subject: btrfs: handle errors from btrfs_read_node_slot in split While investigating a problem with error injection I tripped over curious behavior in the node/leaf splitting code. If we get an EIO when trying to read either the left or right leaf/node for splitting we'll simply treat the node as if it were full and continue on. The end result of this isn't too bad, we simply end up allocating a block when we may have pushed items into the adjacent blocks. However this does essentially allow us to continue to modify a file system that we've gotten errors on, either from a bad disk or csum mismatch or other corruption. This isn't particularly safe, so instead handle these btrfs_read_node_slot() usages differently. We allow you to pass in any slot, the idea being that we save some code if the slot number is outside of the range of the parent. This means we treat all errors the same, when in reality we only want to ignore -ENOENT. Fix this by changing how we call btrfs_read_node_slot(), which is to only call it for slots we know are valid. This way if we get an error back from reading the block we can properly pass the error up the chain. This was validated with the error injection testing I was doing. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a1c109d7956a..e1045e6d5e84 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1064,11 +1064,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = btrfs_read_node_slot(parent, pslot - 1); - if (IS_ERR(left)) - left = NULL; + if (pslot) { + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) { + ret = PTR_ERR(left); + left = NULL; + goto enospc; + } - if (left) { __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, @@ -1079,11 +1082,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = btrfs_read_node_slot(parent, pslot + 1); - if (IS_ERR(right)) - right = NULL; + if (pslot + 1 < btrfs_header_nritems(parent)) { + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) { + ret = PTR_ERR(right); + right = NULL; + goto enospc; + } - if (right) { __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, @@ -1240,14 +1246,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = btrfs_read_node_slot(parent, pslot - 1); - if (IS_ERR(left)) - left = NULL; - /* first, try to make some room in the middle buffer */ - if (left) { + if (pslot) { u32 left_nr; + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) + return PTR_ERR(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); @@ -1292,16 +1298,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = btrfs_read_node_slot(parent, pslot + 1); - if (IS_ERR(right)) - right = NULL; /* * then try to empty the right most buffer into the middle */ - if (right) { + if (pslot + 1 < btrfs_header_nritems(parent)) { u32 right_nr; + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) + return PTR_ERR(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); @@ -3198,12 +3205,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_write_locked(path->nodes[1]); right = btrfs_read_node_slot(upper, slot + 1); - /* - * slot + 1 is not valid or we fail to read the right node, - * no big deal, just return. - */ if (IS_ERR(right)) - return 1; + return PTR_ERR(right); __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); @@ -3417,12 +3420,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_write_locked(path->nodes[1]); left = btrfs_read_node_slot(path->nodes[1], slot - 1); - /* - * slot - 1 is not valid or we fail to read the left node, - * no big deal, just return. - */ if (IS_ERR(left)) - return 1; + return PTR_ERR(left); __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); -- cgit v1.2.3 From fdf8d595f49c79f1c47e5a91c4c6582572c5eeee Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 24 Feb 2023 11:31:26 +0800 Subject: btrfs: open code btrfs_bin_search() btrfs_bin_search() is a simple wrapper that searches for the whole slots by calling btrfs_generic_bin_search() with the starting slot/first_slot preset to 0. This simple wrapper can be open coded as btrfs_bin_search(). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 13 +++++++------ fs/btrfs/ctree.h | 17 ++--------------- fs/btrfs/relocation.c | 6 +++--- fs/btrfs/tree-log.c | 2 +- 4 files changed, 13 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e1045e6d5e84..3b956176b038 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. - * Use a value of 0 to search over the whole extent buffer. + * Use a value of 0 to search over the whole extent buffer. Works for both + * leaves and nodes. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, - const struct btrfs_key *key, int *slot) +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; @@ -1871,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return btrfs_generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -2328,7 +2329,7 @@ again: */ btrfs_unlock_up_safe(p, level + 1); - ret = btrfs_bin_search(b, key, &slot); + ret = btrfs_bin_search(b, 0, key, &slot); if (ret < 0) goto done; @@ -4575,7 +4576,7 @@ again: while (1) { nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); - sret = btrfs_bin_search(cur, min_key, &slot); + sret = btrfs_bin_search(cur, 0, min_key, &slot); if (sret < 0) { ret = sret; goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 97897107fab5..4c1986cd5bed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, - const struct btrfs_key *key, int *slot); +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). - */ -static inline int btrfs_bin_search(struct extent_buffer *eb, - const struct btrfs_key *key, - int *slot) -{ - return btrfs_generic_bin_search(eb, 0, key, slot); -} - -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef13a9d4e370..09b1988d1791 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1266,7 +1266,7 @@ again: level = btrfs_header_level(parent); ASSERT(level >= lowest_level); - ret = btrfs_bin_search(parent, &key, &slot); + ret = btrfs_bin_search(parent, 0, &key, &slot); if (ret < 0) break; if (ret && slot > 0) @@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (upper->eb && !upper->locked) { if (!lowest) { - ret = btrfs_bin_search(upper->eb, key, &slot); + ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); @@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, slot = path->slots[upper->level]; btrfs_release_path(path); } else { - ret = btrfs_bin_search(upper->eb, key, &slot); + ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 200cea6e49e5..9ab793b638a7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4099,7 +4099,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, found_key.offset = 0; found_key.type = 0; - ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); + ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); if (ret < 0) break; -- cgit v1.2.3 From 524f14bb114a20d9b3d8db25f93b532d4207fcac Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 5 Apr 2023 18:52:23 +0100 Subject: btrfs: remove pointless loop at btrfs_get_next_valid_item() It's pointless to have a while loop at btrfs_get_next_valid_item(), as if the slot on the current leaf is beyond the last item, we call btrfs_next_leaf(), which leaves us at a valid slot of the next leaf (or a valid slot in the current leaf if after releasing the path an item gets pushed from the next leaf to the current leaf). So just call btrfs_next_leaf() if the current slot on the current leaf is beyond the last item. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3b956176b038..3c983c70028a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2490,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path) { - while (1) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { int ret; - const int slot = path->slots[0]; - const struct extent_buffer *leaf = path->nodes[0]; - /* This is where we start walking the path. */ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * If we've reached the last slot in this leaf we need - * to go to the next leaf and reset the path. - */ - ret = btrfs_next_leaf(root, path); - if (ret) - return ret; - continue; - } - /* Store the found, valid item in @key. */ - btrfs_item_key_to_cpu(leaf, key, slot); - break; + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; } + + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); return 0; } -- cgit v1.2.3 From 6f932d4ef007d6a4ae03badcb749fbb8f49196f6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Apr 2023 11:33:09 +0100 Subject: btrfs: fix btrfs_prev_leaf() to not return the same key twice A call to btrfs_prev_leaf() may end up returning a path that points to the same item (key) again. This happens if while btrfs_prev_leaf(), after we release the path, a concurrent insertion happens, which moves items off from a sibling into the front of the previous leaf, and an item with the computed previous key does not exists. For example, suppose we have the two following leaves: Leaf A ------------------------------------------------------------- | ... key (300 96 10) key (300 96 15) key (300 96 16) | ------------------------------------------------------------- slot 20 slot 21 slot 22 Leaf B ------------------------------------------------------------- | key (300 96 20) key (300 96 21) key (300 96 22) ... | ------------------------------------------------------------- slot 0 slot 1 slot 2 If we call btrfs_prev_leaf(), from btrfs_previous_item() for example, with a path pointing to leaf B and slot 0 and the following happens: 1) At btrfs_prev_leaf() we compute the previous key to search as: (300 96 19), which is a key that does not exists in the tree; 2) Then we call btrfs_release_path() at btrfs_prev_leaf(); 3) Some other task inserts a key at leaf A, that sorts before the key at slot 20, for example it has an objectid of 299. In order to make room for the new key, the key at slot 22 is moved to the front of leaf B. This happens at push_leaf_right(), called from split_leaf(). After this leaf B now looks like: -------------------------------------------------------------------------------- | key (300 96 16) key (300 96 20) key (300 96 21) key (300 96 22) ... | -------------------------------------------------------------------------------- slot 0 slot 1 slot 2 slot 3 4) At btrfs_prev_leaf() we call btrfs_search_slot() for the computed previous key: (300 96 19). Since the key does not exists, btrfs_search_slot() returns 1 and with a path pointing to leaf B and slot 1, the item with key (300 96 20); 5) This makes btrfs_prev_leaf() return a path that points to slot 1 of leaf B, the same key as before it was called, since the key at slot 0 of leaf B (300 96 16) is less than the computed previous key, which is (300 96 19); 6) As a consequence btrfs_previous_item() returns a path that points again to the item with key (300 96 20). For some users of btrfs_prev_leaf() or btrfs_previous_item() this may not be functional a problem, despite not making sense to return a new path pointing again to the same item/key. However for a caller such as tree-log.c:log_dir_items(), this has a bad consequence, as it can result in not logging some dir index deletions in case the directory is being logged without holding the inode's VFS lock (logging triggered while logging a child inode for example) - for the example scenario above, in case the dir index keys 17, 18 and 19 were deleted in the current transaction. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3c983c70028a..16414bd9e496 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4478,10 +4478,12 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; + struct btrfs_key orig_key; struct btrfs_disk_key found_key; int ret; btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; if (key.offset > 0) { key.offset--; @@ -4498,8 +4500,36 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret <= 0) return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one of more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0 and just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); /* -- cgit v1.2.3 From 9ae5afd02a03d4e22a17a9609b19400b77c36273 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Apr 2023 11:51:35 +0100 Subject: btrfs: abort transaction when sibling keys check fails for leaves If the sibling keys check fails before we move keys from one sibling leaf to another, we are not aborting the transaction - we leave that to some higher level caller of btrfs_search_slot() (or anything else that uses it to insert items into a b+tree). This means that the transaction abort will provide a stack trace that omits the b+tree modification call chain. So change this to immediately abort the transaction and therefore get a more useful stack trace that shows us the call chain in the bt+tree modification code. It's also important to immediately abort the transaction just in case some higher level caller is not doing it, as this indicates a very serious corruption and we should stop the possibility of doing further damage. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 16414bd9e496..adfc04bd362e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3215,6 +3215,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (check_sibling_keys(left, right)) { ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(right); free_extent_buffer(right); return ret; @@ -3433,6 +3434,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (check_sibling_keys(left, right)) { ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); goto out; } return __push_leaf_left(trans, path, min_data_size, empty, left, -- cgit v1.2.3 From a2cea677db6099d71c9f70de7f907d3d7e6bec3b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Apr 2023 11:51:36 +0100 Subject: btrfs: print extent buffers when sibling keys check fails When trying to move keys from one node/leaf to another sibling node/leaf, if the sibling keys check fails we just print an error message with the last key of the left sibling and the first key of the right sibling. However it's also useful to print all the keys of each sibling, as it may provide some clues to what went wrong, which code path may be inserting keys in an incorrect order. So just do that, print the siblings with btrfs_print_tree(), as it works for both leaves and nodes. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs/ctree.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index adfc04bd362e..2ff2961b1183 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2627,6 +2627,10 @@ static bool check_sibling_keys(struct extent_buffer *left, } if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, "left extent buffer:"); + btrfs_print_tree(left, false); + btrfs_crit(left->fs_info, "right extent buffer:"); + btrfs_print_tree(right, false); btrfs_crit(left->fs_info, "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", left_last.objectid, left_last.type, -- cgit v1.2.3