From fdf8d595f49c79f1c47e5a91c4c6582572c5eeee Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 24 Feb 2023 11:31:26 +0800 Subject: btrfs: open code btrfs_bin_search() btrfs_bin_search() is a simple wrapper that searches for the whole slots by calling btrfs_generic_bin_search() with the starting slot/first_slot preset to 0. This simple wrapper can be open coded as btrfs_bin_search(). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 200cea6e49e5..9ab793b638a7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4099,7 +4099,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, found_key.offset = 0; found_key.type = 0; - ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); + ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); if (ret < 0) break; -- cgit v1.2.3 From e6b430f817ca7c44a083deab8c54ab8d56e99d54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Apr 2023 16:03:55 +0200 Subject: btrfs: tree-log: factor out a clean_log_buffer helper The tree-log code has three almost identical copies for the accounting on an extent_buffer that doesn't need to be written any more. The only difference is that walk_down_log_tree passed the bytenr used to find the buffer instead of extent_buffer.start and calculates the length using the nodesize, while the other two callers look at the extent_buffer.len field that must always be equivalent to the nodesize. Factor the code into a common helper. Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 92 ++++++++++++++++++----------------------------------- 1 file changed, 31 insertions(+), 61 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9ab793b638a7..f6c3f14fbdb6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) btrfs_put_block_group(cache); } +static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) +{ + int ret; + + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + + if (trans) { + ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); + if (ret) + return ret; + btrfs_redirty_list_add(trans->transaction, eb); + } else { + unaccount_log_buffer(eb->fs_info, eb->start); + } + + return 0; +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, @@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - u32 blocksize; int ret = 0; while (*level > 0) { @@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, check.level = *level - 1; check.has_first_key = true; btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); - blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), @@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; - } - btrfs_redirty_list_add( - trans->transaction, next); - } else { - unaccount_log_buffer(fs_info, bytenr); + ret = clean_log_buffer(trans, next); + if (ret) { + free_extent_buffer(next); + return ret; } } free_extent_buffer(next); @@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; int i; int slot; int ret; @@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, return ret; if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[*level]; - - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; - btrfs_redirty_list_add(trans->transaction, - next); - } else { - unaccount_log_buffer(fs_info, - path->nodes[*level]->start); - } + ret = clean_log_buffer(trans, path->nodes[*level]); + if (ret) + return ret; } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2720,7 +2709,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, static int walk_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; int wret; int level; @@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, orig_level); if (ret) goto out; - if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[orig_level]; - - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - next->start, next->len); - if (ret) - goto out; - btrfs_redirty_list_add(trans->transaction, next); - } else { - unaccount_log_buffer(fs_info, next->start); - } - } + if (wc->free) + ret = clean_log_buffer(trans, path->nodes[orig_level]); } out: -- cgit v1.2.3 From fa4b8cb1738097586f49ec66cd3656cef47ac143 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 5 Apr 2023 18:51:29 +0100 Subject: btrfs: avoid iterating over all indexes when logging directory When logging a directory, after copying all directory index items from the subvolume tree to the log tree, we iterate over the subvolume tree to find all dir index items that are located in leaves COWed (or created) in the current transaction. If we keep logging a directory several times during the same transaction, we end up iterating over the same dir index items everytime we log the directory, wasting time and adding extra lock contention on the subvolume tree. So just keep track of the last logged dir index offset in order to start the search for that index (+1) the next time the directory is logged, as dir index values (key offsets) come from a monotonically increasing counter. The following test measures the difference before and after this change: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT # Time values in milliseconds. declare -a fsync_times # Total number of files added to the test directory. num_files=1000000 # Fsync directory after every N files are added. fsync_period=100 mkdir $MNT/testdir fsync_total_time=0 for ((i = 1; i <= $num_files; i++)); do echo -n > $MNT/testdir/file_$i if [ $((i % fsync_period)) -eq 0 ]; then start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir end=$(date +%s%N) fsync_total_time=$((fsync_total_time + (end - start))) fsync_times[i]=$(( (end - start) / 1000000 )) echo -n -e "Progress $i / $num_files\r" fi done echo -e "\nHistogram of directory fsync duration in ms:\n" printf '%s\n' "${fsync_times[@]}" | \ perl -MStatistics::Histogram -e '@d = <>; print get_histogram(\@d);' fsync_total_time=$((fsync_total_time / 1000000)) echo -e "\nTotal time spent in fsync: $fsync_total_time ms\n" echo umount $MNT The test was run on a non-debug kernel (Debian's default kernel config) against a 15G null block device. Result before this change: Histogram of directory fsync duration in ms: Count: 10000 Range: 3.000 - 362.000; Mean: 34.556; Median: 31.000; Stddev: 25.751 Percentiles: 90th: 71.000; 95th: 77.000; 99th: 81.000 3.000 - 5.278: 1423 ################################# 5.278 - 8.854: 1173 ########################### 8.854 - 14.467: 591 ############## 14.467 - 23.277: 1025 ####################### 23.277 - 37.105: 1422 ################################# 37.105 - 58.809: 2036 ############################################### 58.809 - 92.876: 2316 ##################################################### 92.876 - 146.346: 6 | 146.346 - 230.271: 6 | 230.271 - 362.000: 2 | Total time spent in fsync: 350527 ms Result after this change: Histogram of directory fsync duration in ms: Count: 10000 Range: 3.000 - 1088.000; Mean: 8.704; Median: 8.000; Stddev: 12.576 Percentiles: 90th: 12.000; 95th: 14.000; 99th: 17.000 3.000 - 6.007: 3222 ################################# 6.007 - 11.276: 5197 ##################################################### 11.276 - 20.506: 1551 ################ 20.506 - 36.674: 24 | 36.674 - 201.552: 1 | 201.552 - 353.841: 4 | 353.841 - 1088.000: 1 | Total time spent in fsync: 92114 ms Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 32 +++++++++++++++++++++++++++----- fs/btrfs/tree-log.c | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bb4984480669..ec2ae4406c16 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -142,11 +142,22 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* - * Total number of bytes pending delalloc, used by stat to calculate the - * real block usage of the file. This is used only for files. - */ - u64 delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc, used by stat to + * calculate the real block usage of the file. This is used + * only for files. + */ + u64 delalloc_bytes; + /* + * The lowest possible index of the next dir index key which + * points to an inode that needs to be logged. + * This is used only for directories. + * Use the helpers btrfs_get_first_dir_index_to_log() and + * btrfs_set_first_dir_index_to_log() to access this field. + */ + u64 first_dir_index_to_log; + }; union { /* @@ -247,6 +258,17 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) +{ + return READ_ONCE(inode->first_dir_index_to_log); +} + +static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, + u64 index) +{ + WRITE_ONCE(inode->first_dir_index_to_log, index); +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f6c3f14fbdb6..d746ebf841fd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3618,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, ret = BTRFS_LOG_FORCE_COMMIT; else inode->last_dir_index_offset = last_index; + + if (btrfs_get_first_dir_index_to_log(inode) == 0) + btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); out: kfree(ins_data); @@ -5376,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; u64 ino = btrfs_ino(start_inode); + struct btrfs_inode *curr_inode = start_inode; int ret = 0; /* @@ -5390,18 +5394,23 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + /* Pairs with btrfs_add_delayed_iput below. */ + ihold(&curr_inode->vfs_inode); + while (true) { + struct inode *vfs_inode; struct extent_buffer *leaf; struct btrfs_key min_key; + u64 next_index; bool continue_curr_inode = true; int nritems; int i; min_key.objectid = ino; min_key.type = BTRFS_DIR_INDEX_KEY; - min_key.offset = 0; + min_key.offset = btrfs_get_first_dir_index_to_log(curr_inode); + next_index = min_key.offset; again: - btrfs_release_path(path); ret = btrfs_search_forward(root, &min_key, path, trans->transid); if (ret < 0) { break; @@ -5426,6 +5435,8 @@ again: break; } + next_index = min_key.offset + 1; + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) @@ -5466,12 +5477,16 @@ again: break; } + btrfs_release_path(path); + if (continue_curr_inode && min_key.offset < (u64)-1) { min_key.offset++; goto again; } next: + btrfs_set_first_dir_index_to_log(curr_inode, next_index); + if (list_empty(&dir_list)) break; @@ -5479,9 +5494,22 @@ next: ino = dir_elem->ino; list_del(&dir_elem->list); kfree(dir_elem); + + btrfs_add_delayed_iput(curr_inode); + curr_inode = NULL; + + vfs_inode = btrfs_iget(fs_info->sb, ino, root); + if (IS_ERR(vfs_inode)) { + ret = PTR_ERR(vfs_inode); + break; + } + curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); + if (curr_inode) + btrfs_add_delayed_iput(curr_inode); + if (ret) { struct btrfs_dir_list *next; -- cgit v1.2.3 From 5d3e4f1d5123d79932177c3e7461a968f412775a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 5 Apr 2023 18:51:30 +0100 Subject: btrfs: use log root when iterating over index keys when logging directory When logging dir dentries of a directory, we iterate over the subvolume tree to find dir index keys on leaves modified in the current transaction. This however is heavy on locking, since btrfs_search_forward() may often keep locks on extent buffers for quite a while when walking the tree to find a suitable leaf modified in the current transaction and with a key not smaller than then the provided minimum key. That means it will block other tasks trying to access the subvolume tree, which may be common fs operations like creating, renaming, linking, unlinking, reflinking files, etc. A better solution is to iterate the log tree, since it's much smaller than a subvolume tree and just use plain btrfs_search_slot() (or the wrapper btrfs_for_each_slot()) and only contains dir index keys added in the current transaction. The following bonnie++ test on a non-debug kernel (with Debian's default kernel config) on a 20G null block device, was used to measure the impact: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 NR_DIRECTORIES=20 NR_FILES=20480 # must be a multiple of 1024 DATASET_SIZE=$(( (8 * 1024 * 1024 * 1024) / 1048576 )) # 8 GiB as megabytes DIRECTORY_SIZE=$(( DATASET_SIZE / NR_FILES )) NR_FILES=$(( NR_FILES / 1024 )) umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT bonnie++ -u root -d $MNT \ -n $NR_FILES:$DIRECTORY_SIZE:$DIRECTORY_SIZE:$NR_DIRECTORIES \ -r 0 -s $DATASET_SIZE -b umount $MNT Before patchset: Version 2.00a ------Sequential Output------ --Sequential Input- --Random- -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks-- Name:Size etc /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP debian0 8G 376k 99 1.1g 98 939m 92 1527k 99 3.2g 99 9060 256 Latency 24920us 207us 680ms 5594us 171us 2891us Version 2.00a ------Sequential Create------ --------Random Create-------- debian0 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete-- files /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP 20/20 20480 96 +++++ +++ 20480 95 20480 99 +++++ +++ 20480 97 Latency 8708us 137us 5128us 6743us 60us 19712us After patchset: Version 2.00a ------Sequential Output------ --Sequential Input- --Random- -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks-- Name:Size etc /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP debian0 8G 384k 99 1.2g 99 971m 91 1533k 99 3.3g 99 9180 309 Latency 24930us 125us 661ms 5587us 46us 2020us Version 2.00a ------Sequential Create------ --------Random Create-------- debian0 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete-- files /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP 20/20 20480 90 +++++ +++ 20480 99 20480 99 +++++ +++ 20480 97 Latency 7030us 61us 1246us 4942us 56us 16855us The patchset consists of this patch plus a previous one that has the following subject: "btrfs: avoid iterating over all indexes when logging directory" Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d746ebf841fd..9b212e8c70cc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5399,45 +5399,34 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, while (true) { struct inode *vfs_inode; - struct extent_buffer *leaf; - struct btrfs_key min_key; + struct btrfs_key key; + struct btrfs_key found_key; u64 next_index; bool continue_curr_inode = true; - int nritems; - int i; + int iter_ret; - min_key.objectid = ino; - min_key.type = BTRFS_DIR_INDEX_KEY; - min_key.offset = btrfs_get_first_dir_index_to_log(curr_inode); - next_index = min_key.offset; + key.objectid = ino; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = btrfs_get_first_dir_index_to_log(curr_inode); + next_index = key.offset; again: - ret = btrfs_search_forward(root, &min_key, path, trans->transid); - if (ret < 0) { - break; - } else if (ret > 0) { - ret = 0; - goto next; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - for (i = path->slots[0]; i < nritems; i++) { + btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; struct inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; - btrfs_item_key_to_cpu(leaf, &min_key, i); - if (min_key.objectid != ino || - min_key.type != BTRFS_DIR_INDEX_KEY) { + if (found_key.objectid != ino || + found_key.type != BTRFS_DIR_INDEX_KEY) { continue_curr_inode = false; break; } - next_index = min_key.offset + 1; + next_index = found_key.offset + 1; - di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) continue; @@ -5479,12 +5468,20 @@ again: btrfs_release_path(path); - if (continue_curr_inode && min_key.offset < (u64)-1) { - min_key.offset++; + if (iter_ret < 0) { + ret = iter_ret; + goto out; + } else if (iter_ret > 0) { + continue_curr_inode = false; + } else { + key = found_key; + } + + if (continue_curr_inode && key.offset < (u64)-1) { + key.offset++; goto again; } -next: btrfs_set_first_dir_index_to_log(curr_inode, next_index); if (list_empty(&dir_list)) -- cgit v1.2.3 From 8fd9f4232d8152c650fd15127f533a0f6d0a4b2b Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Tue, 16 May 2023 09:34:30 +0800 Subject: btrfs: fix an uninitialized variable warning in btrfs_log_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following warning reported by gcc 10.2.1 under x86_64: ../fs/btrfs/tree-log.c: In function ‘btrfs_log_inode’: ../fs/btrfs/tree-log.c:6211:9: error: ‘last_range_start’ may be used uninitialized in this function [-Werror=maybe-uninitialized] 6211 | ret = insert_dir_log_key(trans, log, path, key.objectid, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6212 | first_dir_index, last_dir_index); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../fs/btrfs/tree-log.c:6161:6: note: ‘last_range_start’ was declared here 6161 | u64 last_range_start; | ^~~~~~~~~~~~~~~~ This might be a false positive fixed in later compiler versions but we want to have it fixed. Reported-by: k2ci Reviewed-by: Anand Jain Signed-off-by: Shida Zhang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9b212e8c70cc..d2755d5e338b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6158,7 +6158,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, { struct btrfs_root *log = inode->root->log_root; const struct btrfs_delayed_item *curr; - u64 last_range_start; + u64 last_range_start = 0; u64 last_range_end = 0; struct btrfs_key key; -- cgit v1.2.3