Diffstat (limited to 'fs')
61 files changed, 3709 insertions, 1339 deletions
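Several hunks below (backref.c, block-group.c, delayed-inode.c, delayed-ref.c) replace open-coded cached-rbtree insertion loops with rb_find_add_cached() plus a comparator. A minimal sketch of that pattern, outside of btrfs and using a hypothetical item type, is:

	#include <linux/rbtree.h>
	#include <linux/errno.h>
	#include <linux/types.h>

	struct item {
		struct rb_node rb_node;
		u64 key;
	};

	/* rb_find_add_cached() passes the node being inserted first, the existing node second. */
	static int item_cmp(const struct rb_node *new, const struct rb_node *exist)
	{
		const struct item *a = rb_entry(new, struct item, rb_node);
		const struct item *b = rb_entry(exist, struct item, rb_node);

		if (a->key < b->key)
			return -1;
		if (a->key > b->key)
			return 1;
		return 0;
	}

	static int item_insert(struct rb_root_cached *root, struct item *ins)
	{
		struct rb_node *exist;

		/* Returns the already-present node on a key collision, NULL on successful insert. */
		exist = rb_find_add_cached(&ins->rb_node, root, item_cmp);
		if (exist)
			return -EEXIST;
		return 0;
	}

The helper keeps the leftmost-node cache up to date internally, which is why the converted call sites can drop their hand-rolled "leftmost" tracking and rb_link_node()/rb_insert_color_cached() calls.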
| diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3cfc440c636c..2d5f0482678b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \  	tests/extent-buffer-tests.o tests/btrfs-tests.o \  	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \  	tests/free-space-tree-tests.o tests/extent-map-tests.o \ -	tests/raid-stripe-tree-tests.o +	tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 361a866c1995..a4c51600a408 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -18,7 +18,7 @@ enum {  };  #define NO_THRESHOLD (-1) -#define DFT_THRESHOLD (32) +#define DEFAULT_THRESHOLD (32)  struct btrfs_workqueue {  	struct workqueue_struct *normal_wq; @@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,  	ret->limit_active = limit_active;  	if (thresh == 0) -		thresh = DFT_THRESHOLD; +		thresh = DEFAULT_THRESHOLD;  	/* For low threshold, disabling threshold is a better choice */ -	if (thresh < DFT_THRESHOLD) { +	if (thresh < DEFAULT_THRESHOLD) {  		ret->current_active = limit_active;  		ret->thresh = NO_THRESHOLD;  	} else { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04f53ca548e1..3d3923cfc357 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1,  	return 0;  } +static int prelim_ref_rb_add_cmp(const struct rb_node *new, +				 const struct rb_node *exist) +{ +	const struct prelim_ref *ref_new = +		rb_entry(new, struct prelim_ref, rbnode); +	const struct prelim_ref *ref_exist = +		rb_entry(exist, struct prelim_ref, rbnode); + +	/* +	 * prelim_ref_compare() expects the first parameter as the existing one, +	 * different from the rb_find_add_cached() order. +	 */ +	return prelim_ref_compare(ref_exist, ref_new); +} +  static void update_share_count(struct share_check *sc, int oldcount,  			       int newcount, const struct prelim_ref *newref)  { @@ -278,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,  			      struct share_check *sc)  {  	struct rb_root_cached *root; -	struct rb_node **p; -	struct rb_node *parent = NULL; -	struct prelim_ref *ref; -	int result; -	bool leftmost = true; +	struct rb_node *exist;  	root = &preftree->root; -	p = &root->rb_root.rb_node; +	exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp); +	if (exist) { +		struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode); +		/* Identical refs, merge them and free @newref */ +		struct extent_inode_elem *eie = ref->inode_list; -	while (*p) { -		parent = *p; -		ref = rb_entry(parent, struct prelim_ref, rbnode); -		result = prelim_ref_compare(ref, newref); -		if (result < 0) { -			p = &(*p)->rb_left; -		} else if (result > 0) { -			p = &(*p)->rb_right; -			leftmost = false; -		} else { -			/* Identical refs, merge them and free @newref */ -			struct extent_inode_elem *eie = ref->inode_list; - -			while (eie && eie->next) -				eie = eie->next; +		while (eie && eie->next) +			eie = eie->next; -			if (!eie) -				ref->inode_list = newref->inode_list; -			else -				eie->next = newref->inode_list; -			trace_btrfs_prelim_ref_merge(fs_info, ref, newref, -						     preftree->count); -			/* -			 * A delayed ref can have newref->count < 0. -			 * The ref->count is updated to follow any -			 * BTRFS_[ADD|DROP]_DELAYED_REF actions. 
-			 */ -			update_share_count(sc, ref->count, -					   ref->count + newref->count, newref); -			ref->count += newref->count; -			free_pref(newref); -			return; -		} +		if (!eie) +			ref->inode_list = newref->inode_list; +		else +			eie->next = newref->inode_list; +		trace_btrfs_prelim_ref_merge(fs_info, ref, newref, +							preftree->count); +		/* +		 * A delayed ref can have newref->count < 0. +		 * The ref->count is updated to follow any +		 * BTRFS_[ADD|DROP]_DELAYED_REF actions. +		 */ +		update_share_count(sc, ref->count, +					ref->count + newref->count, newref); +		ref->count += newref->count; +		free_pref(newref); +		return;  	}  	update_share_count(sc, 0, newref->count, newref);  	preftree->count++;  	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); -	rb_link_node(&newref->rbnode, parent, p); -	rb_insert_color_cached(&newref->rbnode, root, leftmost);  }  /* @@ -3022,9 +3021,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,  	cache->rb_root = RB_ROOT;  	for (i = 0; i < BTRFS_MAX_LEVEL; i++)  		INIT_LIST_HEAD(&cache->pending[i]); -	INIT_LIST_HEAD(&cache->changed); -	INIT_LIST_HEAD(&cache->detached); -	INIT_LIST_HEAD(&cache->leaves);  	INIT_LIST_HEAD(&cache->pending_edge);  	INIT_LIST_HEAD(&cache->useless_node);  	cache->fs_info = fs_info; @@ -3132,29 +3128,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,  void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,  				struct btrfs_backref_node *node)  { -	struct btrfs_backref_node *upper;  	struct btrfs_backref_edge *edge;  	if (!node)  		return; -	BUG_ON(!node->lowest && !node->detached);  	while (!list_empty(&node->upper)) {  		edge = list_entry(node->upper.next, struct btrfs_backref_edge,  				  list[LOWER]); -		upper = edge->node[UPPER];  		list_del(&edge->list[LOWER]);  		list_del(&edge->list[UPPER]);  		btrfs_backref_free_edge(cache, edge); - -		/* -		 * Add the node to leaf node list if no other child block -		 * cached. 
-		 */ -		if (list_empty(&upper->lower)) { -			list_add_tail(&upper->lower, &cache->leaves); -			upper->lowest = 1; -		}  	}  	btrfs_backref_drop_node(cache, node); @@ -3166,33 +3150,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,  void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)  {  	struct btrfs_backref_node *node; -	int i; - -	while (!list_empty(&cache->detached)) { -		node = list_entry(cache->detached.next, -				  struct btrfs_backref_node, list); -		btrfs_backref_cleanup_node(cache, node); -	} -	while (!list_empty(&cache->leaves)) { -		node = list_entry(cache->leaves.next, -				  struct btrfs_backref_node, lower); +	while ((node = rb_entry_safe(rb_first(&cache->rb_root), +				     struct btrfs_backref_node, rb_node)))  		btrfs_backref_cleanup_node(cache, node); -	} -	for (i = 0; i < BTRFS_MAX_LEVEL; i++) { -		while (!list_empty(&cache->pending[i])) { -			node = list_first_entry(&cache->pending[i], -						struct btrfs_backref_node, -						list); -			btrfs_backref_cleanup_node(cache, node); -		} -	}  	ASSERT(list_empty(&cache->pending_edge));  	ASSERT(list_empty(&cache->useless_node)); -	ASSERT(list_empty(&cache->changed)); -	ASSERT(list_empty(&cache->detached)); -	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));  	ASSERT(!cache->nr_nodes);  	ASSERT(!cache->nr_edges);  } @@ -3316,8 +3280,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,  	root = btrfs_get_fs_root(fs_info, ref_key->offset, false);  	if (IS_ERR(root))  		return PTR_ERR(root); -	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) -		cur->cowonly = 1; + +	/* We shouldn't be using backref cache for non-shareable roots. */ +	if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { +		btrfs_put_root(root); +		return -EUCLEAN; +	}  	if (btrfs_root_level(&root->root_item) == cur->level) {  		/* Tree root */ @@ -3403,8 +3371,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,  				goto out;  			}  			upper->owner = btrfs_header_owner(eb); -			if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) -				upper->cowonly = 1; + +			/* We shouldn't be using backref cache for non shareable roots. */ +			if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { +				btrfs_put_root(root); +				btrfs_backref_free_edge(cache, edge); +				btrfs_backref_free_node(cache, upper); +				ret = -EUCLEAN; +				goto out; +			}  			/*  			 * If we know the block isn't shared we can avoid @@ -3595,15 +3570,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,  	ASSERT(start->checked); -	/* Insert this node to cache if it's not COW-only */ -	if (!start->cowonly) { -		rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, -					   &start->rb_node); -		if (rb_node) -			btrfs_backref_panic(cache->fs_info, start->bytenr, -					    -EEXIST); -		list_add_tail(&start->lower, &cache->leaves); -	} +	rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); +	if (rb_node) +		btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);  	/*  	 * Use breadth first search to iterate all related edges. @@ -3642,11 +3611,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,  		 * parents have already been linked.  		 
*/  		if (!RB_EMPTY_NODE(&upper->rb_node)) { -			if (upper->lowest) { -				list_del_init(&upper->lower); -				upper->lowest = 0; -			} -  			list_add_tail(&edge->list[UPPER], &upper->lower);  			continue;  		} @@ -3657,23 +3621,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,  			return -EUCLEAN;  		} -		/* Sanity check, COW-only node has non-COW-only parent */ -		if (start->cowonly != upper->cowonly) { -			ASSERT(0); +		rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, +					   &upper->rb_node); +		if (unlikely(rb_node)) { +			btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);  			return -EUCLEAN;  		} -		/* Only cache non-COW-only (subvolume trees) tree blocks */ -		if (!upper->cowonly) { -			rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, -						   &upper->rb_node); -			if (rb_node) { -				btrfs_backref_panic(cache->fs_info, -						upper->bytenr, -EEXIST); -				return -EUCLEAN; -			} -		} -  		list_add_tail(&edge->list[UPPER], &upper->lower);  		/* diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e8c22cccb5c1..74e614031274 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -318,6 +318,12 @@ struct btrfs_backref_node {  		u64 bytenr;  	}; /* Use rb_simple_node for search/insert */ +	/* +	 * This is a sanity check, whenever we COW a block we will update +	 * new_bytenr with it's current location, and we will check this in +	 * various places to validate that the cache makes sense, it shouldn't +	 * be used for anything else. +	 */  	u64 new_bytenr;  	/* Objectid of tree block owner, can be not uptodate */  	u64 owner; @@ -335,10 +341,6 @@ struct btrfs_backref_node {  	struct extent_buffer *eb;  	/* Level of the tree block */  	unsigned int level:8; -	/* Is the block in a non-shareable tree */ -	unsigned int cowonly:1; -	/* 1 if no child node is in the cache */ -	unsigned int lowest:1;  	/* Is the extent buffer locked */  	unsigned int locked:1;  	/* Has the block been processed */ @@ -391,12 +393,6 @@ struct btrfs_backref_cache {  	 * level blocks may not reflect the new location  	 */  	struct list_head pending[BTRFS_MAX_LEVEL]; -	/* List of backref nodes with no child node */ -	struct list_head leaves; -	/* List of blocks that have been COWed in current transaction */ -	struct list_head changed; -	/* List of detached backref node. */ -	struct list_head detached;  	u64 last_trans; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 7ea6f0b43b95..bc2555c44a12 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -453,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)  		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),  		dev->devid, bio->bi_iter.bi_size); +	/* +	 * Track reads if tracking is enabled; ignore I/O operations before the +	 * filesystem is fully initialized. 
+	 */ +	if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) +		percpu_counter_add(&dev->fs_info->stats_read_blocks, +				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); +  	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)  		blkcg_punt_bio_submit(bio);  	else @@ -725,8 +733,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)  			bio->bi_opf |= REQ_OP_ZONE_APPEND;  		} -		if (is_data_bbio(bbio) && bioc && -		    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { +		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {  			/*  			 * No locking for the list update, as we only add to  			 * the list in the I/O submission path, and list diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4427c1b835e8..c0a8f7d92acc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -173,43 +173,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)  	}  } +static int btrfs_bg_start_cmp(const struct rb_node *new, +			      const struct rb_node *exist) +{ +	const struct btrfs_block_group *new_bg = +		rb_entry(new, struct btrfs_block_group, cache_node); +	const struct btrfs_block_group *exist_bg = +		rb_entry(exist, struct btrfs_block_group, cache_node); + +	if (new_bg->start < exist_bg->start) +		return -1; +	if (new_bg->start > exist_bg->start) +		return 1; +	return 0; +} +  /*   * This adds the block group to the fs_info rb tree for the block group cache   */  static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,  				       struct btrfs_block_group *block_group)  { -	struct rb_node **p; -	struct rb_node *parent = NULL; -	struct btrfs_block_group *cache; -	bool leftmost = true; +	struct rb_node *exist; +	int ret = 0;  	ASSERT(block_group->length != 0);  	write_lock(&info->block_group_cache_lock); -	p = &info->block_group_cache_tree.rb_root.rb_node; - -	while (*p) { -		parent = *p; -		cache = rb_entry(parent, struct btrfs_block_group, cache_node); -		if (block_group->start < cache->start) { -			p = &(*p)->rb_left; -		} else if (block_group->start > cache->start) { -			p = &(*p)->rb_right; -			leftmost = false; -		} else { -			write_unlock(&info->block_group_cache_lock); -			return -EEXIST; -		} -	} - -	rb_link_node(&block_group->cache_node, parent, p); -	rb_insert_color_cached(&block_group->cache_node, -			       &info->block_group_cache_tree, leftmost); +	exist = rb_find_add_cached(&block_group->cache_node, +			&info->block_group_cache_tree, btrfs_bg_start_cmp); +	if (exist) +		ret = -EEXIST;  	write_unlock(&info->block_group_cache_lock); -	return 0; +	return ret;  }  /* @@ -1223,7 +1221,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	block_group->space_info->total_bytes -= block_group->length;  	block_group->space_info->bytes_readonly -=  		(block_group->length - block_group->zone_unusable); -	btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, +	btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,  						    -block_group->zone_unusable);  	block_group->space_info->disk_total -= block_group->length * factor; @@ -1396,8 +1394,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)  		if (btrfs_is_zoned(cache->fs_info)) {  			/* Migrate zone_unusable bytes to readonly */  			sinfo->bytes_readonly += cache->zone_unusable; -			btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, -								    -cache->zone_unusable); +			btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable); 
 			cache->zone_unusable = 0;  		}  		cache->ro++; @@ -1645,8 +1642,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)  		spin_lock(&space_info->lock);  		spin_lock(&block_group->lock); -		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -						     -block_group->pinned); +		btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);  		space_info->bytes_readonly += block_group->pinned;  		block_group->pinned = 0; @@ -2672,7 +2668,6 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,  	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);  	btrfs_set_dev_extent_length(leaf, extent, num_bytes); -	btrfs_mark_buffer_dirty(trans, leaf);  out:  	btrfs_free_path(path);  	return ret; @@ -3060,8 +3055,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)  				(cache->alloc_offset - cache->used - cache->pinned -  				 cache->reserved) +  				(cache->length - cache->zone_capacity); -			btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, -								    cache->zone_unusable); +			btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);  			sinfo->bytes_readonly -= cache->zone_unusable;  		}  		num_bytes = cache->length - cache->reserved - @@ -3123,7 +3117,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,  						   cache->global_root_id);  	btrfs_set_stack_block_group_flags(&bgi, cache->flags);  	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); -	btrfs_mark_buffer_dirty(trans, leaf);  fail:  	btrfs_release_path(path);  	/* @@ -3699,7 +3692,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,  		old_val -= num_bytes;  		cache->used = old_val;  		cache->pinned += num_bytes; -		btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); +		btrfs_space_info_update_bytes_pinned(space_info, num_bytes);  		space_info->bytes_used -= num_bytes;  		space_info->disk_used -= num_bytes * factor;  		if (READ_ONCE(space_info->periodic_reclaim)) @@ -3781,8 +3774,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,  	space_info->bytes_reserved += num_bytes;  	trace_btrfs_space_reservation(cache->fs_info, "space_info",  				      space_info->flags, num_bytes, 1); -	btrfs_space_info_update_bytes_may_use(cache->fs_info, -					      space_info, -ram_bytes); +	btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);  	if (delalloc)  		cache->delalloc_bytes += num_bytes; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a07b9594dc70..3f3608299c0b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,  			spin_unlock(&dest->lock);  		}  		if (num_bytes) -			btrfs_space_info_free_bytes_may_use(fs_info, -							    space_info, -							    num_bytes); +			btrfs_space_info_free_bytes_may_use(space_info, num_bytes);  	}  	if (qgroup_to_release_ret)  		*qgroup_to_release_ret = qgroup_to_release; @@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)  	if (block_rsv->reserved < block_rsv->size) {  		num_bytes = block_rsv->size - block_rsv->reserved; -		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, -						      num_bytes); +		btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);  		block_rsv->reserved = block_rsv->size;  	} else if (block_rsv->reserved > block_rsv->size) {  		num_bytes = block_rsv->reserved - block_rsv->size; -		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, -						      -num_bytes); +		
btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);  		block_rsv->reserved = block_rsv->size;  		btrfs_try_granting_tickets(fs_info, sinfo);  	} diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index aa1f55cd81b7..b2fa33911c28 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -526,7 +526,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,  			u32 bio_offset, struct bio_vec *bv);  noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,  			      struct btrfs_file_extent *file_extent, -			      bool nowait, bool strict); +			      bool nowait);  void btrfs_del_delalloc_inode(struct btrfs_inode *inode);  struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 185985a337b3..92071ca0655f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,  static int balance_node_right(struct btrfs_trans_handle *trans,  			      struct extent_buffer *dst_buf,  			      struct extent_buffer *src_buf); - -static const struct btrfs_csums { -	u16		size; -	const char	name[10]; -	const char	driver[12]; -} btrfs_csums[] = { -	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, -	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, -	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, -	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", -				     .driver = "blake2b-256" }, -}; -  /*   * The leaf data grows from end-to-front in the node.  this returns the address   * of the start of the last item, which is the stop of the leaf data stack. @@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst,  			      nr_items * sizeof(struct btrfs_item));  } -/* This exists for btrfs-progs usages. */ -u16 btrfs_csum_type_size(u16 type) -{ -	return btrfs_csums[type].size; -} - -int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ -	u16 t = btrfs_super_csum_type(s); -	/* -	 * csum type is validated at mount time -	 */ -	return btrfs_csum_type_size(t); -} - -const char *btrfs_super_csum_name(u16 csum_type) -{ -	/* csum type is validated at mount time */ -	return btrfs_csums[csum_type].name; -} - -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name - */ -const char *btrfs_super_csum_driver(u16 csum_type) -{ -	/* csum type is validated at mount time */ -	return btrfs_csums[csum_type].driver[0] ? -		btrfs_csums[csum_type].driver : -		btrfs_csums[csum_type].name; -} - -size_t __attribute_const__ btrfs_get_num_csums(void) -{ -	return ARRAY_SIZE(btrfs_csums); -} -  struct btrfs_path *btrfs_alloc_path(void)  {  	might_sleep(); @@ -226,22 +175,6 @@ noinline void btrfs_release_path(struct btrfs_path *p)  }  /* - * We want the transaction abort to print stack trace only for errors where the - * cause could be a bug, eg. due to ENOSPC, and not for common errors that are - * caused by external factors. - */ -bool __cold abort_should_print_stack(int error) -{ -	switch (error) { -	case -EIO: -	case -EROFS: -	case -ENOMEM: -		return false; -	} -	return true; -} - -/*   * safely gets a reference on the root node of a tree.  A lock   * is not taken, so a concurrent writer may put a different node   * at the root of the tree.  
See btrfs_lock_root_node for the @@ -3900,6 +3833,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);  	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && +	       key.type != BTRFS_RAID_STRIPE_KEY &&  	       key.type != BTRFS_EXTENT_CSUM_KEY);  	if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c341956a01c..1096a80a64e7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -7,7 +7,6 @@  #define BTRFS_CTREE_H  #include "linux/cleanup.h" -#include <linux/pagemap.h>  #include <linux/spinlock.h>  #include <linux/rbtree.h>  #include <linux/mutex.h> @@ -506,20 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)  	return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);  } -#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ -				((bytes) >> (fs_info)->sectorsize_bits) - -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) -{ -	return mapping_gfp_constraint(mapping, ~__GFP_FS); -} - -void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); -int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, -			 u64 num_bytes, u64 *actual_bytes); -int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - -/* ctree.c */  int __init btrfs_ctree_init(void);  void __cold btrfs_ctree_exit(void); @@ -756,18 +741,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)  	return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;  } -u16 btrfs_csum_type_size(u16 type); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - -/* - * We use folio flag owner_2 to indicate there is an ordered extent with - * unfinished IO. 
- */ -#define folio_test_ordered(folio)	folio_test_owner_2(folio) -#define folio_set_ordered(folio)	folio_set_owner_2(folio) -#define folio_clear_ordered(folio)	folio_clear_owner_2(folio) -  #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7aa8a395d838..88e900e5a43d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,  	ASSERT(IS_ALIGNED(len, fs_info->sectorsize));  	data_sinfo = fs_info->data_sinfo; -	btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); +	btrfs_space_info_free_bytes_may_use(data_sinfo, len);  }  /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 508bdbae29a0..0b4933c6a889 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -366,40 +366,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(  	return NULL;  } +static int btrfs_delayed_item_cmp(const struct rb_node *new, +				  const struct rb_node *exist) +{ +	const struct btrfs_delayed_item *new_item = +		rb_entry(new, struct btrfs_delayed_item, rb_node); +	const struct btrfs_delayed_item *exist_item = +		rb_entry(exist, struct btrfs_delayed_item, rb_node); + +	if (new_item->index < exist_item->index) +		return -1; +	if (new_item->index > exist_item->index) +		return 1; +	return 0; +} +  static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,  				    struct btrfs_delayed_item *ins)  { -	struct rb_node **p, *node; -	struct rb_node *parent_node = NULL;  	struct rb_root_cached *root; -	struct btrfs_delayed_item *item; -	bool leftmost = true; +	struct rb_node *exist;  	if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)  		root = &delayed_node->ins_root;  	else  		root = &delayed_node->del_root; -	p = &root->rb_root.rb_node; -	node = &ins->rb_node; - -	while (*p) { -		parent_node = *p; -		item = rb_entry(parent_node, struct btrfs_delayed_item, -				 rb_node); - -		if (item->index < ins->index) { -			p = &(*p)->rb_right; -			leftmost = false; -		} else if (item->index > ins->index) { -			p = &(*p)->rb_left; -		} else { -			return -EEXIST; -		} -	} - -	rb_link_node(node, parent_node, p); -	rb_insert_color_cached(node, root, leftmost); +	exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp); +	if (exist) +		return -EEXIST;  	if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&  	    ins->index >= delayed_node->index_cnt) @@ -1038,7 +1033,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,  				    struct btrfs_inode_item);  	write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,  			    sizeof(struct btrfs_inode_item)); -	btrfs_mark_buffer_dirty(trans, leaf);  	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))  		goto out; @@ -1561,8 +1555,7 @@ release_node:  	return ret;  } -static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, -					       struct btrfs_delayed_node *node, +static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,  					       u64 index)  {  	struct btrfs_delayed_item *item; @@ -1620,7 +1613,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,  	if (IS_ERR(node))  		return PTR_ERR(node); -	ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); +	ret = btrfs_delete_delayed_insertion_item(node, index);  	if (!ret)  		goto end; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0d878dbbabba..98c5b61dabe8 100644 --- 
a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)  	u64 num_bytes;  	u64 reserved_bytes; +	if (btrfs_is_testing(fs_info)) +		return; +  	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);  	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,  						       trans->delayed_ref_csum_deletions); @@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,  	spin_unlock(&block_rsv->lock);  	if (to_free > 0) -		btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); +		btrfs_space_info_free_bytes_may_use(space_info, to_free);  	if (refilled_bytes > 0)  		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -265,8 +268,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,  /*   * compare two delayed data backrefs with same bytenr and type   */ -static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, -			  struct btrfs_delayed_ref_node *ref2) +static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, +			  const struct btrfs_delayed_ref_node *ref2)  {  	if (ref1->data_ref.objectid < ref2->data_ref.objectid)  		return -1; @@ -279,8 +282,8 @@ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,  	return 0;  } -static int comp_refs(struct btrfs_delayed_ref_node *ref1, -		     struct btrfs_delayed_ref_node *ref2, +static int comp_refs(const struct btrfs_delayed_ref_node *ref1, +		     const struct btrfs_delayed_ref_node *ref2,  		     bool check_seq)  {  	int ret = 0; @@ -314,34 +317,25 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,  	return 0;  } +static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) +{ +	const struct btrfs_delayed_ref_node *new_node = +		rb_entry(new, struct btrfs_delayed_ref_node, ref_node); +	const struct btrfs_delayed_ref_node *exist_node = +		rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); + +	return comp_refs(new_node, exist_node, true); +} +  static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,  		struct btrfs_delayed_ref_node *ins)  { -	struct rb_node **p = &root->rb_root.rb_node;  	struct rb_node *node = &ins->ref_node; -	struct rb_node *parent_node = NULL; -	struct btrfs_delayed_ref_node *entry; -	bool leftmost = true; - -	while (*p) { -		int comp; - -		parent_node = *p; -		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, -				 ref_node); -		comp = comp_refs(ins, entry, true); -		if (comp < 0) { -			p = &(*p)->rb_left; -		} else if (comp > 0) { -			p = &(*p)->rb_right; -			leftmost = false; -		} else { -			return entry; -		} -	} +	struct rb_node *exist; -	rb_link_node(node, parent_node, p); -	rb_insert_color_cached(node, root, leftmost); +	exist = rb_find_add_cached(node, root, cmp_refs_node); +	if (exist) +		return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node);  	return NULL;  } @@ -555,6 +549,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,  		delayed_refs->num_heads_ready--;  } +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ +	struct btrfs_delayed_ref_node *ref; + +	lockdep_assert_held(&head->mutex); +	lockdep_assert_held(&head->lock); + +	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) +		return NULL; + +	/* +	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 
+	 * This is to prevent a ref count from going down to zero, which deletes +	 * the extent item from the extent tree, when there still are references +	 * to add, which would fail because they would not find the extent item. +	 */ +	if (!list_empty(&head->ref_add_list)) +		return list_first_entry(&head->ref_add_list, +					struct btrfs_delayed_ref_node, add_list); + +	ref = rb_entry(rb_first_cached(&head->ref_tree), +		       struct btrfs_delayed_ref_node, ref_node); +	ASSERT(list_empty(&ref->add_list)); +	return ref; +} +  /*   * Helper to insert the ref_node to the tail or merge with tail.   * @@ -1234,6 +1254,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)  {  	struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;  	struct btrfs_fs_info *fs_info = trans->fs_info; +	bool testing = btrfs_is_testing(fs_info);  	spin_lock(&delayed_refs->lock);  	while (true) { @@ -1263,7 +1284,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)  		spin_unlock(&delayed_refs->lock);  		mutex_unlock(&head->mutex); -		if (pin_bytes) { +		if (!testing && pin_bytes) {  			struct btrfs_block_group *bg;  			bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1281,8 +1302,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)  				spin_lock(&bg->space_info->lock);  				spin_lock(&bg->lock);  				bg->pinned += head->num_bytes; -				btrfs_space_info_update_bytes_pinned(fs_info, -								     bg->space_info, +				btrfs_space_info_update_bytes_pinned(bg->space_info,  								     head->num_bytes);  				bg->reserved -= head->num_bytes;  				bg->space_info->bytes_reserved -= head->num_bytes; @@ -1295,12 +1315,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)  			btrfs_error_unpin_extent_range(fs_info, head->bytenr,  				head->bytenr + head->num_bytes - 1);  		} -		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); +		if (!testing) +			btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);  		btrfs_put_delayed_ref_head(head);  		cond_resched();  		spin_lock(&delayed_refs->lock);  	} -	btrfs_qgroup_destroy_extent_records(trans); + +	if (!testing) +		btrfs_qgroup_destroy_extent_records(trans);  	spin_unlock(&delayed_refs->lock);  } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 611fb3388f82..a35067cebb97 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -402,6 +402,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(  		struct btrfs_delayed_ref_root *delayed_refs);  void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,  			     struct btrfs_delayed_ref_head *head); +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head);  int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ac8e97ed13f7..f86fbea0b3de 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -440,9 +440,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)  		dev_replace->cursor_right);  	dev_replace->item_needs_writeback = 0;  	up_write(&dev_replace->rwsem); - -	btrfs_mark_buffer_dirty(trans, eb); -  out:  	btrfs_free_path(path); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 1ea5d8fcfbf7..ccf91de29f80 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -92,7 +92,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,  	write_extent_buffer(leaf, name, name_ptr, name_len);  	write_extent_buffer(leaf, data, 
data_ptr, data_len); -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	return ret;  } @@ -152,7 +151,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,  	name_ptr = (unsigned long)(dir_item + 1);  	write_extent_buffer(leaf, name->name, name_ptr, name->len); -	btrfs_mark_buffer_dirty(trans, leaf);  second_insert:  	/* FIXME, use some real flag for selecting the extra index */ diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index a7c3e221378d..8567af46e16f 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -248,8 +248,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,  		len = min(len, em->len - (start - em->start));  		block_start = extent_map_block_start(em) + (start - em->start); -		if (can_nocow_extent(inode, start, &len, -				     &file_extent, false, false) == 1) { +		if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) {  			bg = btrfs_inc_nocow_writers(fs_info, block_start);  			if (bg)  				can_nocow = true; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eff0dd1ae62f..f09db62e61a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -226,7 +226,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,  	while (1) {  		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); -		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); +		ret = read_extent_buffer_pages(eb, mirror_num, check);  		if (!ret)  			break; @@ -1258,6 +1258,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)  {  	struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; +	percpu_counter_destroy(&fs_info->stats_read_blocks);  	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);  	percpu_counter_destroy(&fs_info->delalloc_bytes);  	percpu_counter_destroy(&fs_info->ordered_bytes); @@ -2327,6 +2328,71 @@ out:  	return ret;  } +static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, +				    const struct btrfs_super_block *sb) +{ +	unsigned int cur = 0; /* Offset inside the sys chunk array */ +	/* +	 * At sb read time, fs_info is not fully initialized. Thus we have +	 * to use super block sectorsize, which should have been validated. 
+	 */ +	const u32 sectorsize = btrfs_super_sectorsize(sb); +	u32 sys_array_size = btrfs_super_sys_array_size(sb); + +	if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { +		btrfs_err(fs_info, "system chunk array too big %u > %u", +			  sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); +		return -EUCLEAN; +	} + +	while (cur < sys_array_size) { +		struct btrfs_disk_key *disk_key; +		struct btrfs_chunk *chunk; +		struct btrfs_key key; +		u64 type; +		u16 num_stripes; +		u32 len; +		int ret; + +		disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); +		len = sizeof(*disk_key); + +		if (cur + len > sys_array_size) +			goto short_read; +		cur += len; + +		btrfs_disk_key_to_cpu(&key, disk_key); +		if (key.type != BTRFS_CHUNK_ITEM_KEY) { +			btrfs_err(fs_info, +			    "unexpected item type %u in sys_array at offset %u", +				  key.type, cur); +			return -EUCLEAN; +		} +		chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); +		num_stripes = btrfs_stack_chunk_num_stripes(chunk); +		if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) +			goto short_read; +		type = btrfs_stack_chunk_type(chunk); +		if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { +			btrfs_err(fs_info, +			"invalid chunk type %llu in sys_array at offset %u", +				  type, cur); +			return -EUCLEAN; +		} +		ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, +					      sectorsize); +		if (ret < 0) +			return ret; +		cur += btrfs_chunk_item_size(num_stripes); +	} +	return 0; +short_read: +	btrfs_err(fs_info, +	"super block sys chunk array short read, cur=%u sys_array_size=%u", +		  cur, sys_array_size); +	return -EUCLEAN; +} +  /*   * Real super block validation   * NOTE: super csum type and incompat features will not be checked here. @@ -2495,6 +2561,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,  		ret = -EINVAL;  	} +	ret = validate_sys_chunk_array(fs_info, sb); +  	/*  	 * Obvious sys_chunk_array corruptions, it must hold at least one key  	 * and one chunk @@ -2856,6 +2924,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block  	if (ret)  		return ret; +	ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); +	if (ret) +		return ret; +  	fs_info->dirty_metadata_batch = PAGE_SIZE *  					(1 + ilog2(nr_cpu_ids)); @@ -3321,6 +3393,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device  	fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);  	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;  	fs_info->stripesize = stripesize; +	fs_info->fs_devices->fs_info = fs_info;  	/*  	 * Handle the space caching options appropriately now that we have the diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a7051e2570c1..587842991b24 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -96,9 +96,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);  /*   * This function is used to grab the root, and avoid it is freed when we   * access it. But it doesn't ensure that the tree is not dropped. 
- * - * If you want to ensure the whole tree is safe, you should use - * 	fs_info->subvol_srcu   */  static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)  { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3c6f7fecbb9a..3014a1a23efd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -570,7 +570,6 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,  			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);  		}  	} -	btrfs_mark_buffer_dirty(trans, leaf);  	ret = 0;  fail:  	btrfs_release_path(path); @@ -618,7 +617,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,  			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);  		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)  			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); -		btrfs_mark_buffer_dirty(trans, leaf);  	}  	return ret;  } @@ -1050,7 +1048,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans,  	} else {  		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);  	} -	btrfs_mark_buffer_dirty(trans, leaf);  }  static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1195,7 +1192,6 @@ static noinline_for_stack int update_inline_extent_backref(  		item_size -= size;  		btrfs_truncate_item(trans, path, item_size, 1);  	} -	btrfs_mark_buffer_dirty(trans, leaf);  	return 0;  } @@ -1260,12 +1256,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,  {  	int j, ret = 0;  	u64 bytes_left, end; -	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); +	u64 aligned_start = ALIGN(start, SECTOR_SIZE);  	/* Adjust the range to be aligned to 512B sectors if necessary. */  	if (start != aligned_start) {  		len -= aligned_start - start; -		len = round_down(len, 1 << SECTOR_SHIFT); +		len = round_down(len, SECTOR_SIZE);  		start = aligned_start;  	} @@ -1527,7 +1523,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  	if (extent_op)  		__run_delayed_extent_op(extent_op, leaf, item); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	/* now insert the actual backref */ @@ -1711,8 +1706,6 @@ again:  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);  	__run_delayed_extent_op(extent_op, leaf, ei); - -	btrfs_mark_buffer_dirty(trans, leaf);  out:  	btrfs_free_path(path);  	return ret; @@ -1803,30 +1796,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  	return ret;  } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ -	struct btrfs_delayed_ref_node *ref; - -	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) -		return NULL; - -	/* -	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. -	 * This is to prevent a ref count from going down to zero, which deletes -	 * the extent item from the extent tree, when there still are references -	 * to add, which would fail because they would not find the extent item. 
-	 */ -	if (!list_empty(&head->ref_add_list)) -		return list_first_entry(&head->ref_add_list, -				struct btrfs_delayed_ref_node, add_list); - -	ref = rb_entry(rb_first_cached(&head->ref_tree), -		       struct btrfs_delayed_ref_node, ref_node); -	ASSERT(list_empty(&ref->add_list)); -	return ref; -} -  static struct btrfs_delayed_extent_op *cleanup_extent_op(  				struct btrfs_delayed_ref_head *head)  { @@ -1959,7 +1928,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,  	lockdep_assert_held(&locked_ref->mutex);  	lockdep_assert_held(&locked_ref->lock); -	while ((ref = select_delayed_ref(locked_ref))) { +	while ((ref = btrfs_select_delayed_ref(locked_ref))) {  		if (ref->seq &&  		    btrfs_check_delayed_seq(fs_info, ref->seq)) {  			spin_unlock(&locked_ref->lock); @@ -2230,10 +2199,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  	return ret;  } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode,  				      struct btrfs_path *path, -				      u64 objectid, u64 offset, u64 bytenr) +				      u64 offset, u64 bytenr)  { +	struct btrfs_root *root = inode->root;  	struct btrfs_delayed_ref_head *head;  	struct btrfs_delayed_ref_node *ref;  	struct btrfs_delayed_ref_root *delayed_refs; @@ -2307,7 +2277,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,  		 * then we have a cross reference.  		 */  		if (ref->ref_root != btrfs_root_id(root) || -		    ref_owner != objectid || ref_offset != offset) { +		    ref_owner != btrfs_ino(inode) || ref_offset != offset) {  			ret = 1;  			break;  		} @@ -2318,11 +2288,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root,  	return ret;  } -static noinline int check_committed_ref(struct btrfs_root *root, +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode:     The only inode we expect to find associated with the data extent. + * @path:      A path to use for searching the extent tree. + * @offset:    The only offset we expect to find associated with the data extent. + * @bytenr:    The logical address of the data extent. + * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + *       quick here as we're called in write paths (when flushing delalloc and + *       in the direct IO write path). 
For example we can have an extent with + *       a single reference but that reference is not inlined, or we may have + *       many references in the extent tree but we also have delayed references + *       that cancel all the reference except the one for our inode and offset, + *       but it would be expensive to do such checks and complex due to all + *       locking to avoid races between the checks and flushing delayed refs, + *       plus non-inline references may be located on leaves other than the one + *       that contains the extent item in the extent tree. The important thing + *       here is to not return false negatives and that the false positives are + *       not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + *          leaf from the extent tree that contains the extent's extent item. + * + *          1 if there are cross references (false positives can happen). + * + *          < 0 in case of an error. In case of -ENOENT the leaf in the extent + *          tree where the extent item should be located at is read locked and + *          accessible in the given path. + */ +static noinline int check_committed_ref(struct btrfs_inode *inode,  					struct btrfs_path *path, -					u64 objectid, u64 offset, u64 bytenr, -					bool strict) +					u64 offset, u64 bytenr)  { +	struct btrfs_root *root = inode->root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);  	struct extent_buffer *leaf; @@ -2341,35 +2353,32 @@ static noinline int check_committed_ref(struct btrfs_root *root,  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);  	if (ret < 0) -		goto out; +		return ret;  	if (ret == 0) {  		/*  		 * Key with offset -1 found, there would have to exist an extent  		 * item with such offset, but this is out of the valid range.  		 */ -		ret = -EUCLEAN; -		goto out; +		return -EUCLEAN;  	} -	ret = -ENOENT;  	if (path->slots[0] == 0) -		goto out; +		return -ENOENT;  	path->slots[0]--;  	leaf = path->nodes[0];  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);  	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) -		goto out; +		return -ENOENT; -	ret = 1;  	item_size = btrfs_item_size(leaf, path->slots[0]);  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);  	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);  	/* No inline refs; we need to bail before checking for owner ref. */  	if (item_size == sizeof(*ei)) -		goto out; +		return 1;  	/* Check for an owner ref; skip over it to the real inline refs. */  	iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2377,56 +2386,69 @@ static noinline int check_committed_ref(struct btrfs_root *root,  	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {  		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);  		iref = (struct btrfs_extent_inline_ref *)(iref + 1); +		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);  	}  	/* If extent item has more than 1 inline ref then it's shared */  	if (item_size != expected_size) -		goto out; - -	/* -	 * If extent created before last snapshot => it's shared unless the -	 * snapshot has been deleted. Use the heuristic if strict is false. 
-	 */ -	if (!strict && -	    (btrfs_extent_generation(leaf, ei) <= -	     btrfs_root_last_snapshot(&root->root_item))) -		goto out; +		return 1;  	/* If this extent has SHARED_DATA_REF then it's shared */ -	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);  	if (type != BTRFS_EXTENT_DATA_REF_KEY) -		goto out; +		return 1;  	ref = (struct btrfs_extent_data_ref *)(&iref->offset);  	if (btrfs_extent_refs(leaf, ei) !=  	    btrfs_extent_data_ref_count(leaf, ref) ||  	    btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || -	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid || +	    btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) ||  	    btrfs_extent_data_ref_offset(leaf, ref) != offset) -		goto out; +		return 1; -	ret = 0; -out: -	return ret; +	return 0;  } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, -			  u64 bytenr, bool strict, struct btrfs_path *path) +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, +			  u64 bytenr, struct btrfs_path *path)  {  	int ret;  	do { -		ret = check_committed_ref(root, path, objectid, -					  offset, bytenr, strict); +		ret = check_committed_ref(inode, path, offset, bytenr);  		if (ret && ret != -ENOENT)  			goto out; -		ret = check_delayed_ref(root, path, objectid, offset, bytenr); +		/* +		 * The path must have a locked leaf from the extent tree where +		 * the extent item for our extent is located, in case it exists, +		 * or where it should be located in case it doesn't exist yet +		 * because it's new and its delayed ref was not yet flushed. +		 * We need to lock the delayed ref head at check_delayed_ref(), +		 * if one exists, while holding the leaf locked in order to not +		 * race with delayed ref flushing, missing references and +		 * incorrectly reporting that the extent is not shared. 
+		 */ +		if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { +			struct extent_buffer *leaf = path->nodes[0]; + +			ASSERT(leaf != NULL); +			btrfs_assert_tree_read_locked(leaf); + +			if (ret != -ENOENT) { +				struct btrfs_key key; + +				btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +				ASSERT(key.objectid == bytenr); +				ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); +			} +		} + +		ret = check_delayed_ref(inode, path, offset, bytenr);  	} while (ret == -EAGAIN && !path->nowait);  out:  	btrfs_release_path(path); -	if (btrfs_is_data_reloc_root(root)) +	if (btrfs_is_data_reloc_root(inode->root))  		WARN_ON(ret > 0);  	return ret;  } @@ -2571,13 +2593,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,  			   struct btrfs_block_group *cache,  			   u64 bytenr, u64 num_bytes, int reserved)  { -	struct btrfs_fs_info *fs_info = cache->fs_info; -  	spin_lock(&cache->space_info->lock);  	spin_lock(&cache->lock);  	cache->pinned += num_bytes; -	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, -					     num_bytes); +	btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes);  	if (reserved) {  		cache->reserved -= num_bytes;  		cache->space_info->bytes_reserved -= num_bytes; @@ -2724,15 +2743,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,  {  	struct btrfs_block_group *cache = NULL;  	struct btrfs_space_info *space_info; -	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;  	struct btrfs_free_cluster *cluster = NULL; -	u64 len;  	u64 total_unpinned = 0;  	u64 empty_cluster = 0;  	bool readonly;  	int ret = 0;  	while (start <= end) { +		u64 len; +  		readonly = false;  		if (!cache ||  		    start >= cache->start + cache->length) { @@ -2778,37 +2797,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,  		spin_lock(&space_info->lock);  		spin_lock(&cache->lock);  		cache->pinned -= len; -		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); +		btrfs_space_info_update_bytes_pinned(space_info, -len);  		space_info->max_extent_size = 0;  		if (cache->ro) {  			space_info->bytes_readonly += len;  			readonly = true;  		} else if (btrfs_is_zoned(fs_info)) {  			/* Need reset before reusing in a zoned block group */ -			btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, -								    len); +			btrfs_space_info_update_bytes_zone_unusable(space_info, len);  			readonly = true;  		}  		spin_unlock(&cache->lock); -		if (!readonly && return_free_space && -		    global_rsv->space_info == space_info) { -			spin_lock(&global_rsv->lock); -			if (!global_rsv->full) { -				u64 to_add = min(len, global_rsv->size - -						      global_rsv->reserved); - -				global_rsv->reserved += to_add; -				btrfs_space_info_update_bytes_may_use(fs_info, -						space_info, to_add); -				if (global_rsv->reserved >= global_rsv->size) -					global_rsv->full = 1; -				len -= to_add; -			} -			spin_unlock(&global_rsv->lock); -		} -		/* Add to any tickets we may have */ -		if (!readonly && return_free_space && len) -			btrfs_try_granting_tickets(fs_info, space_info); +		if (!readonly && return_free_space) +			btrfs_return_free_space(space_info, len);  		spin_unlock(&space_info->lock);  	} @@ -3259,7 +3260,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			}  		} else {  			btrfs_set_extent_refs(leaf, ei, refs); -			btrfs_mark_buffer_dirty(trans, leaf);  		}  		if (found_extent) {  			ret = remove_extent_backref(trans, extent_root, path, @@ -4827,7 +4827,6 @@ static int alloc_reserved_file_extent(struct 
btrfs_trans_handle *trans,  		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);  	} -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	btrfs_free_path(path);  	return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4902,7 +4901,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  		btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);  	} -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_free_path(path);  	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2ad51130c037..cfa52264f678 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -116,8 +116,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,  int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,  				    const struct extent_buffer *eb);  int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, -			  u64 objectid, u64 offset, u64 bytenr, bool strict, +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr,  			  struct btrfs_path *path);  struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,  					     struct btrfs_root *root, @@ -163,5 +162,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,  			struct btrfs_root *root,  			struct extent_buffer *node,  			struct extent_buffer *parent); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, +			 u64 num_bytes, u64 *actual_bytes); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);  #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b923d0cec61c..d9f856358704 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -198,9 +198,8 @@ static void __process_folios_contig(struct address_space *mapping,  				    u64 end, unsigned long page_ops)  {  	struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); -	pgoff_t start_index = start >> PAGE_SHIFT; +	pgoff_t index = start >> PAGE_SHIFT;  	pgoff_t end_index = end >> PAGE_SHIFT; -	pgoff_t index = start_index;  	struct folio_batch fbatch;  	int i; @@ -221,7 +220,7 @@ static void __process_folios_contig(struct address_space *mapping,  	}  } -static noinline void __unlock_for_delalloc(const struct inode *inode, +static noinline void unlock_delalloc_folio(const struct inode *inode,  					   const struct folio *locked_folio,  					   u64 start, u64 end)  { @@ -242,9 +241,8 @@ static noinline int lock_delalloc_folios(struct inode *inode,  {  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);  	struct address_space *mapping = inode->i_mapping; -	pgoff_t start_index = start >> PAGE_SHIFT; +	pgoff_t index = start >> PAGE_SHIFT;  	pgoff_t end_index = end >> PAGE_SHIFT; -	pgoff_t index = start_index;  	u64 processed_end = start;  	struct folio_batch fbatch; @@ -288,8 +286,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,  out:  	folio_batch_release(&fbatch);  	if (processed_end > start) -		__unlock_for_delalloc(inode, locked_folio, start, -				      processed_end); +		unlock_delalloc_folio(inode, locked_folio, start, processed_end);  	return -EAGAIN;  } @@ -390,7 +387,7 @@ again:  	unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);  	if (!ret) { -		__unlock_for_delalloc(inode, locked_folio, delalloc_start, +		unlock_delalloc_folio(inode, locked_folio, 
delalloc_start,  				      delalloc_end);  		cond_resched();  		goto again; @@ -710,6 +707,7 @@ static void alloc_new_bio(struct btrfs_inode *inode,  	bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,  			       bio_ctrl->end_io_func, NULL);  	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; +	bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint;  	bbio->inode = inode;  	bbio->file_offset = file_offset;  	bio_ctrl->bbio = bbio; @@ -862,11 +860,6 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,  	return ret;  } -int set_page_extent_mapped(struct page *page) -{ -	return set_folio_extent_mapped(page_folio(page)); -} -  int set_folio_extent_mapped(struct folio *folio)  {  	struct btrfs_fs_info *fs_info; @@ -901,9 +894,9 @@ void clear_folio_extent_mapped(struct folio *folio)  	folio_detach_private(folio);  } -static struct extent_map *__get_extent_map(struct inode *inode, -					   struct folio *folio, u64 start, -					   u64 len, struct extent_map **em_cached) +static struct extent_map *get_extent_map(struct btrfs_inode *inode, +					 struct folio *folio, u64 start, +					 u64 len, struct extent_map **em_cached)  {  	struct extent_map *em;  	struct extent_state *cached_state = NULL; @@ -922,14 +915,14 @@ static struct extent_map *__get_extent_map(struct inode *inode,  		*em_cached = NULL;  	} -	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); -	em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); +	btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); +	em = btrfs_get_extent(inode, folio, start, len);  	if (!IS_ERR(em)) {  		BUG_ON(*em_cached);  		refcount_inc(&em->refs);  		*em_cached = em;  	} -	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); +	unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state);  	return em;  } @@ -985,8 +978,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,  			end_folio_read(folio, true, cur, iosize);  			break;  		} -		em = __get_extent_map(inode, folio, cur, end - cur + 1, -				      em_cached); +		em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);  		if (IS_ERR(em)) {  			end_folio_read(folio, false, cur, end + 1 - cur);  			return PTR_ERR(em); @@ -1142,14 +1134,19 @@ static bool find_next_delalloc_bitmap(struct folio *folio,  }  /* - * helper for extent_writepage(), doing all of the delayed allocation setup. + * Do all of the delayed allocation setup.   * - * This returns 1 if btrfs_run_delalloc_range function did all the work required - * to write the page (copy into inline extent).  In this case the IO has - * been started and the page is already unlocked. + * Return >0 if all the dirty blocks are submitted async (compression) or inlined. + * The @folio should no longer be touched (treat it as already unlocked).   * - * This returns 0 if all went well (page still locked) - * This returns < 0 if there were errors (page still locked) + * Return 0 if there is still dirty block that needs to be submitted through + * extent_writepage_io(). + * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be + * submitted, and @folio is still kept locked. + * + * Return <0 if there is any error hit. + * Any allocated ordered extent range covering this folio will be marked + * finished (IOERR), and @folio is still kept locked.   
*/  static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,  						 struct folio *folio, @@ -1167,6 +1164,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,  	 * last delalloc end.  	 */  	u64 last_delalloc_end = 0; +	/* +	 * The range end (exclusive) of the last successfully finished delalloc +	 * range. +	 * Any range covered by ordered extent must either be manually marked +	 * finished (error handling), or has IO submitted (and finish the +	 * ordered extent normally). +	 * +	 * This records the end of ordered extent cleanup if we hit an error. +	 */ +	u64 last_finished_delalloc_end = page_start;  	u64 delalloc_start = page_start;  	u64 delalloc_end = page_end;  	u64 delalloc_to_write = 0; @@ -1235,11 +1242,28 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,  			found_len = last_delalloc_end + 1 - found_start;  		if (ret >= 0) { +			/* +			 * Some delalloc range may be created by previous folios. +			 * Thus we still need to clean up this range during error +			 * handling. +			 */ +			last_finished_delalloc_end = found_start;  			/* No errors hit so far, run the current delalloc range. */  			ret = btrfs_run_delalloc_range(inode, folio,  						       found_start,  						       found_start + found_len - 1,  						       wbc); +			if (ret >= 0) +				last_finished_delalloc_end = found_start + found_len; +			if (unlikely(ret < 0)) +				btrfs_err_rl(fs_info, +"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", +					     btrfs_root_id(inode->root), +					     btrfs_ino(inode), +					     folio_pos(folio), +					     fs_info->sectors_per_page, +					     &bio_ctrl->submit_bitmap, +					     found_start, found_len, ret);  		} else {  			/*  			 * We've hit an error during previous delalloc range, @@ -1247,7 +1271,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,  			 */  			unlock_extent(&inode->io_tree, found_start,  				      found_start + found_len - 1, NULL); -			__unlock_for_delalloc(&inode->vfs_inode, folio, +			unlock_delalloc_folio(&inode->vfs_inode, folio,  					      found_start,  					      found_start + found_len - 1);  		} @@ -1274,8 +1298,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,  		delalloc_start = found_start + found_len;  	} -	if (ret < 0) +	/* +	 * It's possible we had some ordered extents created before we hit +	 * an error, cleanup non-async successfully created delalloc ranges. 
+	 */ +	if (unlikely(ret < 0)) { +		unsigned int bitmap_size = min( +				(last_finished_delalloc_end - page_start) >> +				fs_info->sectorsize_bits, +				fs_info->sectors_per_page); + +		for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) +			btrfs_mark_ordered_io_finished(inode, folio, +				page_start + (bit << fs_info->sectorsize_bits), +				fs_info->sectorsize, false);  		return ret; +	}  out:  	if (last_delalloc_end)  		delalloc_end = last_delalloc_end; @@ -1335,7 +1373,7 @@ static int submit_one_sector(struct btrfs_inode *inode,  	em = btrfs_get_extent(inode, NULL, filepos, sectorsize);  	if (IS_ERR(em)) -		return PTR_ERR_OR_ZERO(em); +		return PTR_ERR(em);  	extent_offset = filepos - em->start;  	em_end = extent_map_end(em); @@ -1391,6 +1429,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,  	struct btrfs_fs_info *fs_info = inode->root->fs_info;  	unsigned long range_bitmap = 0;  	bool submitted_io = false; +	bool error = false;  	const u64 folio_start = folio_pos(folio);  	u64 cur;  	int bit; @@ -1433,11 +1472,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,  			break;  		}  		ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); -		if (ret < 0) -			goto out; +		if (unlikely(ret < 0)) { +			/* +			 * bio_ctrl may contain a bio crossing several folios. +			 * Submit it immediately so that the bio has a chance +			 * to finish normally, other than marked as error. +			 */ +			submit_one_bio(bio_ctrl); +			/* +			 * Failed to grab the extent map which should be very rare. +			 * Since there is no bio submitted to finish the ordered +			 * extent, we have to manually finish this sector. +			 */ +			btrfs_mark_ordered_io_finished(inode, folio, cur, +						       fs_info->sectorsize, false); +			error = true; +			continue; +		}  		submitted_io = true;  	} -out: +  	/*  	 * If we didn't submitted any sector (>= i_size), folio dirty get  	 * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared @@ -1445,8 +1499,11 @@ out:  	 *  	 * Here we set writeback and clear for the range. If the full folio  	 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. +	 * +	 * If we hit any error, the corresponding sector will still be dirty +	 * thus no need to clear PAGECACHE_TAG_DIRTY.  	 
*/ -	if (!submitted_io) { +	if (!submitted_io && !error) {  		btrfs_folio_set_writeback(fs_info, folio, start, len);  		btrfs_folio_clear_writeback(fs_info, folio, start, len);  	} @@ -1464,15 +1521,14 @@ out:   */  static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)  { -	struct inode *inode = folio->mapping->host; -	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); -	const u64 page_start = folio_pos(folio); +	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); +	struct btrfs_fs_info *fs_info = inode->root->fs_info;  	int ret;  	size_t pg_offset; -	loff_t i_size = i_size_read(inode); +	loff_t i_size = i_size_read(&inode->vfs_inode);  	unsigned long end_index = i_size >> PAGE_SHIFT; -	trace_extent_writepage(folio, inode, bio_ctrl->wbc); +	trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);  	WARN_ON(!folio_test_locked(folio)); @@ -1496,26 +1552,28 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl  	if (ret < 0)  		goto done; -	ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); +	ret = writepage_delalloc(inode, folio, bio_ctrl);  	if (ret == 1)  		return 0;  	if (ret)  		goto done; -	ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), +	ret = extent_writepage_io(inode, folio, folio_pos(folio),  				  PAGE_SIZE, bio_ctrl, i_size);  	if (ret == 1)  		return 0; +	if (ret < 0) +		btrfs_err_rl(fs_info, +"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", +			     btrfs_root_id(inode->root), btrfs_ino(inode), +			     folio_pos(folio), fs_info->sectors_per_page, +			     &bio_ctrl->submit_bitmap, ret);  	bio_ctrl->wbc->nr_to_write--;  done: -	if (ret) { -		btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, -					       page_start, PAGE_SIZE, !ret); +	if (ret < 0)  		mapping_set_error(folio->mapping, ret); -	} -  	/*  	 * Only unlock ranges that are submitted. As there can be some async  	 * submitted ranges inside the folio. @@ -1525,12 +1583,6 @@ done:  	return ret;  } -void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ -	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, -		       TASK_UNINTERRUPTIBLE); -} -  /*   * Lock extent buffer status and pages for writeback.   
* @@ -1671,11 +1723,10 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio)  {  	struct extent_buffer *eb = bbio->private;  	struct btrfs_fs_info *fs_info = eb->fs_info; -	bool uptodate = !bbio->bio.bi_status;  	struct folio_iter fi;  	u32 bio_offset = 0; -	if (!uptodate) +	if (bbio->bio.bi_status != BLK_STS_OK)  		set_btree_ioerr(eb);  	bio_for_each_folio_all(fi, &bbio->bio) { @@ -2292,11 +2343,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f  		if (ret == 1)  			goto next_page; -		if (ret) { -			btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, -						       cur, cur_len, !ret); +		if (ret)  			mapping_set_error(mapping, ret); -		}  		btrfs_folio_end_lock(fs_info, folio, cur, cur_len);  		if (ret < 0)  			found_error = true; @@ -2495,11 +2543,6 @@ next:  	return try_release_extent_state(io_tree, folio);  } -static void __free_extent_buffer(struct extent_buffer *eb) -{ -	kmem_cache_free(extent_buffer_cache, eb); -} -  static int extent_buffer_under_io(const struct extent_buffer *eb)  {  	return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || @@ -2580,8 +2623,8 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo  	spin_unlock(&folio->mapping->i_private_lock);  } -/* Release all pages attached to the extent buffer */ -static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) +/* Release all folios attached to the extent buffer */ +static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)  {  	ASSERT(!extent_buffer_under_io(eb)); @@ -2603,9 +2646,9 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)   */  static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)  { -	btrfs_release_extent_buffer_pages(eb); +	btrfs_release_extent_buffer_folios(eb);  	btrfs_leak_debug_del_eb(eb); -	__free_extent_buffer(eb); +	kmem_cache_free(extent_buffer_cache, eb);  }  static struct extent_buffer * @@ -2703,7 +2746,7 @@ err:  			folio_put(eb->folios[i]);  		}  	} -	__free_extent_buffer(eb); +	kmem_cache_free(extent_buffer_cache, eb);  	return NULL;  } @@ -2830,13 +2873,12 @@ free_eb:  }  #endif -static struct extent_buffer *grab_extent_buffer( -		struct btrfs_fs_info *fs_info, struct page *page) +static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, +						struct folio *folio)  { -	struct folio *folio = page_folio(page);  	struct extent_buffer *exists; -	lockdep_assert_held(&page->mapping->i_private_lock); +	lockdep_assert_held(&folio->mapping->i_private_lock);  	/*  	 * For subpage case, we completely rely on radix tree to ensure we @@ -2851,7 +2893,7 @@ static struct extent_buffer *grab_extent_buffer(  		return NULL;  	/* -	 * We could have already allocated an eb for this page and attached one +	 * We could have already allocated an eb for this folio and attached one  	 * so lets see if we can get a ref on the existing eb, and if we can we  	 * know it's good and we can just return that one, else we know we can  	 * just overwrite folio private. @@ -2860,16 +2902,19 @@ static struct extent_buffer *grab_extent_buffer(  	if (atomic_inc_not_zero(&exists->refs))  		return exists; -	WARN_ON(PageDirty(page)); +	WARN_ON(folio_test_dirty(folio));  	folio_detach_private(folio);  	return NULL;  } -static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +/* + * Validate alignment constraints of eb at logical address @start. 
+ */ +static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)  {  	if (!IS_ALIGNED(start, fs_info->sectorsize)) {  		btrfs_err(fs_info, "bad tree block start %llu", start); -		return -EINVAL; +		return true;  	}  	if (fs_info->nodesize < PAGE_SIZE && @@ -2877,14 +2922,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)  		btrfs_err(fs_info,  		"tree block crosses page boundary, start %llu nodesize %u",  			  start, fs_info->nodesize); -		return -EINVAL; +		return true;  	}  	if (fs_info->nodesize >= PAGE_SIZE &&  	    !PAGE_ALIGNED(start)) {  		btrfs_err(fs_info,  		"tree block is not page aligned, start %llu nodesize %u",  			  start, fs_info->nodesize); -		return -EINVAL; +		return true;  	}  	if (!IS_ALIGNED(start, fs_info->nodesize) &&  	    !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { @@ -2892,10 +2937,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)  "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",  			      start, fs_info->nodesize);  	} -	return 0; +	return false;  } -  /*   * Return 0 if eb->folios[i] is attached to btree inode successfully.   * Return >0 if there is already another extent buffer for the range, @@ -2951,8 +2995,7 @@ finish:  	} else if (existing_folio) {  		struct extent_buffer *existing_eb; -		existing_eb = grab_extent_buffer(fs_info, -						 folio_page(existing_folio, 0)); +		existing_eb = grab_extent_buffer(fs_info, existing_folio);  		if (existing_eb) {  			/* The extent buffer still exists, we can use it directly. */  			*found_eb_ret = existing_eb; @@ -3149,7 +3192,7 @@ again:  	 * live buffer and won't free them prematurely.  	 */  	for (int i = 0; i < num_folios; i++) -		unlock_page(folio_page(eb->folios[i], 0)); +		folio_unlock(eb->folios[i]);  	return eb;  out: @@ -3173,7 +3216,7 @@ out:  	for (int i = 0; i < attached; i++) {  		ASSERT(eb->folios[i]);  		detach_extent_buffer_folio(eb, eb->folios[i]); -		unlock_page(folio_page(eb->folios[i], 0)); +		folio_unlock(eb->folios[i]);  		folio_put(eb->folios[i]);  		eb->folios[i] = NULL;  	} @@ -3195,7 +3238,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)  	struct extent_buffer *eb =  			container_of(head, struct extent_buffer, rcu_head); -	__free_extent_buffer(eb); +	kmem_cache_free(extent_buffer_cache, eb);  }  static int release_extent_buffer(struct extent_buffer *eb) @@ -3219,11 +3262,11 @@ static int release_extent_buffer(struct extent_buffer *eb)  		}  		btrfs_leak_debug_del_eb(eb); -		/* Should be safe to release our pages at this point */ -		btrfs_release_extent_buffer_pages(eb); +		/* Should be safe to release folios at this point. */ +		btrfs_release_extent_buffer_folios(eb);  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  		if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { -			__free_extent_buffer(eb); +			kmem_cache_free(extent_buffer_cache, eb);  			return 1;  		}  #endif @@ -3382,12 +3425,12 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)  		 * the above race.  		 
*/  		if (subpage) -			lock_page(folio_page(eb->folios[0], 0)); +			folio_lock(eb->folios[0]);  		for (int i = 0; i < num_folios; i++)  			btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],  					      eb->start, eb->len);  		if (subpage) -			unlock_page(folio_page(eb->folios[0], 0)); +			folio_unlock(eb->folios[0]);  		percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,  					 eb->len,  					 eb->fs_info->dirty_metadata_batch); @@ -3497,8 +3540,8 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)  	bio_put(&bbio->bio);  } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, -			     const struct btrfs_tree_parent_check *check) +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, +				    const struct btrfs_tree_parent_check *check)  {  	struct btrfs_bio *bbio;  	bool ret; @@ -3516,7 +3559,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,  	/* Someone else is already reading the buffer, just wait for it. */  	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) -		goto done; +		return 0;  	/*  	 * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above @@ -3556,14 +3599,21 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,  		}  	}  	btrfs_submit_bbio(bbio, mirror_num); +	return 0; +} -done: -	if (wait == WAIT_COMPLETE) { -		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); -		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) -			return -EIO; -	} +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, +			     const struct btrfs_tree_parent_check *check) +{ +	int ret; +	ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); +	if (ret < 0) +		return ret; + +	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); +	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) +		return -EIO;  	return 0;  } @@ -4294,7 +4344,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,  		return;  	} -	ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); +	ret = read_extent_buffer_pages_nowait(eb, 0, &check);  	if (ret < 0)  		free_extent_buffer_stale(eb);  	else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8a36117ed453..6c5328bfabc2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,7 +248,6 @@ int btree_write_cache_pages(struct address_space *mapping,  			    struct writeback_control *wbc);  void btrfs_readahead(struct readahead_control *rac);  int set_folio_extent_mapped(struct folio *folio); -int set_page_extent_mapped(struct page *page);  void clear_folio_extent_mapped(struct folio *folio);  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, @@ -262,12 +261,17 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,  					 u64 start);  void free_extent_buffer(struct extent_buffer *eb);  void free_extent_buffer_stale(struct extent_buffer *eb); -#define WAIT_NONE	0 -#define WAIT_COMPLETE	1 -#define WAIT_PAGE_LOCK	2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,  			     const struct btrfs_tree_parent_check *parent_check); -void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, +				    const struct btrfs_tree_parent_check *parent_check); + +static inline void wait_on_extent_buffer_writeback(struct 
extent_buffer *eb) +{ +	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, +		       TASK_UNINTERRUPTIBLE); +} +  void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,  				u64 bytenr, u64 owner_root, u64 gen, int level);  void btrfs_readahead_node_child(struct extent_buffer *node, int slot); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 886749b39672..d04a3b47b1fb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -190,8 +190,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,  	btrfs_set_file_extent_compression(leaf, item, 0);  	btrfs_set_file_extent_encryption(leaf, item, 0);  	btrfs_set_file_extent_other_encoding(leaf, item, 0); - -	btrfs_mark_buffer_dirty(trans, leaf);  out:  	btrfs_free_path(path);  	return ret; @@ -1259,7 +1257,6 @@ found:  	ins_size /= csum_size;  	total_bytes += ins_size * fs_info->sectorsize; -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	if (total_bytes < sums->len) {  		btrfs_release_path(path);  		cond_resched(); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 14e27473c5bc..36f51c311bb1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,52 +36,7 @@  #include "ioctl.h"  #include "file.h"  #include "super.h" - -/* - * Helper to fault in page and copy.  This should go away and be replaced with - * calls into generic code. - */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, -					 struct folio *folio, struct iov_iter *i) -{ -	size_t copied = 0; -	size_t total_copied = 0; -	int offset = offset_in_page(pos); - -	while (write_bytes > 0) { -		size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); -		/* -		 * Copy data from userspace to the current page -		 */ -		copied = copy_folio_from_iter_atomic(folio, offset, count, i); - -		/* Flush processor's dcache for this page */ -		flush_dcache_folio(folio); - -		/* -		 * if we get a partial write, we can end up with -		 * partially up to date page.  These add -		 * a lot of complexity, so make sure they don't -		 * happen by forcing this copy to be retried. -		 * -		 * The rest of the btrfs_file_write code will fall -		 * back to page at a time copies after we return 0. -		 */ -		if (unlikely(copied < count)) { -			if (!folio_test_uptodate(folio)) { -				iov_iter_revert(i, copied); -				copied = 0; -			} -			if (!copied) -				break; -		} - -		write_bytes -= copied; -		total_copied += copied; -		offset += copied; -	} -	return total_copied; -} +#include "print-tree.h"  /*   * Unlock folio after btrfs_file_write() is done with it. @@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,  }  /* - * After btrfs_copy_from_user(), update the following things for delalloc: + * After copy_folio_from_iter_atomic(), update the following things for delalloc:   * - Mark newly dirtied folio as DELALLOC in the io tree.   *   Used to advise which range is to be written back.   
* - Mark modified folio as Uptodate/Dirty and not needing COW fixup @@ -224,7 +179,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,  	if (args->drop_cache)  		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); -	if (args->start >= inode->disk_i_size && !args->replace_extent) +	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)  		modify_tree = 0;  	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); @@ -245,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,  next_slot:  		leaf = path->nodes[0];  		if (path->slots[0] >= btrfs_header_nritems(leaf)) { -			BUG_ON(del_nr > 0); +			if (WARN_ON(del_nr > 0)) { +				btrfs_print_leaf(leaf); +				ret = -EINVAL; +				break; +			}  			ret = btrfs_next_leaf(root, path);  			if (ret < 0)  				break; @@ -321,7 +280,11 @@ next_slot:  		 *  | -------- extent -------- |  		 */  		if (args->start > key.offset && args->end < extent_end) { -			BUG_ON(del_nr > 0); +			if (WARN_ON(del_nr > 0)) { +				btrfs_print_leaf(leaf); +				ret = -EINVAL; +				break; +			}  			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {  				ret = -EOPNOTSUPP;  				break; @@ -351,7 +314,6 @@ next_slot:  			btrfs_set_file_extent_offset(leaf, fi, extent_offset);  			btrfs_set_file_extent_num_bytes(leaf, fi,  							extent_end - args->start); -			btrfs_mark_buffer_dirty(trans, leaf);  			if (update_refs && disk_bytenr > 0) {  				struct btrfs_ref ref = { @@ -397,7 +359,6 @@ next_slot:  			btrfs_set_file_extent_offset(leaf, fi, extent_offset);  			btrfs_set_file_extent_num_bytes(leaf, fi,  							extent_end - args->end); -			btrfs_mark_buffer_dirty(trans, leaf);  			if (update_refs && disk_bytenr > 0)  				args->bytes_found += args->end - key.offset;  			break; @@ -409,7 +370,11 @@ next_slot:  		 *  | -------- extent -------- |  		 */  		if (args->start > key.offset && args->end >= extent_end) { -			BUG_ON(del_nr > 0); +			if (WARN_ON(del_nr > 0)) { +				btrfs_print_leaf(leaf); +				ret = -EINVAL; +				break; +			}  			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {  				ret = -EOPNOTSUPP;  				break; @@ -417,7 +382,6 @@ next_slot:  			btrfs_set_file_extent_num_bytes(leaf, fi,  							args->start - key.offset); -			btrfs_mark_buffer_dirty(trans, leaf);  			if (update_refs && disk_bytenr > 0)  				args->bytes_found += extent_end - args->start;  			if (args->end == extent_end) @@ -437,7 +401,11 @@ delete_extent_item:  				del_slot = path->slots[0];  				del_nr = 1;  			} else { -				BUG_ON(del_slot + del_nr != path->slots[0]); +				if (WARN_ON(del_slot + del_nr != path->slots[0])) { +					btrfs_print_leaf(leaf); +					ret = -EINVAL; +					break; +				}  				del_nr++;  			} @@ -668,7 +636,6 @@ again:  							 trans->transid);  			btrfs_set_file_extent_num_bytes(leaf, fi,  							end - other_start); -			btrfs_mark_buffer_dirty(trans, leaf);  			goto out;  		}  	} @@ -697,7 +664,6 @@ again:  							other_end - start);  			btrfs_set_file_extent_offset(leaf, fi,  						     start - orig_offset); -			btrfs_mark_buffer_dirty(trans, leaf);  			goto out;  		}  	} @@ -731,7 +697,6 @@ again:  		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);  		btrfs_set_file_extent_num_bytes(leaf, fi,  						extent_end - split); -		btrfs_mark_buffer_dirty(trans, leaf);  		ref.action = BTRFS_ADD_DELAYED_REF;  		ref.bytenr = bytenr; @@ -810,7 +775,6 @@ again:  		btrfs_set_file_extent_type(leaf, fi,  					   BTRFS_FILE_EXTENT_REG);  		btrfs_set_file_extent_generation(leaf, fi, trans->transid); -		
btrfs_mark_buffer_dirty(trans, leaf);  	} else {  		fi = btrfs_item_ptr(leaf, del_slot - 1,  			   struct btrfs_file_extent_item); @@ -819,7 +783,6 @@ again:  		btrfs_set_file_extent_generation(leaf, fi, trans->transid);  		btrfs_set_file_extent_num_bytes(leaf, fi,  						extent_end - key.offset); -		btrfs_mark_buffer_dirty(trans, leaf);  		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);  		if (ret < 0) { @@ -1052,7 +1015,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,  						   &cached_state);  	}  	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, -			       NULL, nowait, false); +			       NULL, nowait);  	if (ret <= 0)  		btrfs_drew_write_unlock(&root->snapshot_lock);  	else @@ -1252,7 +1215,23 @@ again:  			break;  		} -		copied = btrfs_copy_from_user(pos, write_bytes, folio, i); +		copied = copy_folio_from_iter_atomic(folio, +				offset_in_folio(folio, pos), write_bytes, i); +		flush_dcache_folio(folio); + +		/* +		 * If we get a partial write, we can end up with partially +		 * uptodate page. Although if sector size < page size we can +		 * handle it, but if it's not sector aligned it can cause +		 * a lot of complexity, so make sure they don't happen by +		 * forcing retry this copy. +		 */ +		if (unlikely(copied < write_bytes)) { +			if (!folio_test_uptodate(folio)) { +				iov_iter_revert(i, copied); +				copied = 0; +			} +		}  		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);  		dirty_sectors = round_up(copied + sector_offset, @@ -2029,7 +2008,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,  		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);  		btrfs_set_file_extent_offset(leaf, fi, 0);  		btrfs_set_file_extent_generation(leaf, fi, trans->transid); -		btrfs_mark_buffer_dirty(trans, leaf);  		goto out;  	} @@ -2046,7 +2024,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,  		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);  		btrfs_set_file_extent_offset(leaf, fi, 0);  		btrfs_set_file_extent_generation(leaf, fi, trans->transid); -		btrfs_mark_buffer_dirty(trans, leaf);  		goto out;  	}  	btrfs_release_path(path); @@ -2194,7 +2171,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,  	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);  	if (extent_info->is_new_extent)  		btrfs_set_file_extent_generation(leaf, extent, trans->transid); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfa52ef40b06..d42b6f882f57 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -12,7 +12,7 @@  #include <linux/error-injection.h>  #include <linux/sched/mm.h>  #include <linux/string_choices.h> -#include "ctree.h" +#include "extent-tree.h"  #include "fs.h"  #include "messages.h"  #include "misc.h" @@ -198,7 +198,6 @@ static int __create_free_space_inode(struct btrfs_root *root,  	btrfs_set_inode_nlink(leaf, inode_item, 1);  	btrfs_set_inode_transid(leaf, inode_item, trans->transid);  	btrfs_set_inode_block_group(leaf, inode_item, offset); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -216,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root,  				struct btrfs_free_space_header);  	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));  	btrfs_set_free_space_key(leaf, header, 
&disk_key); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	return 0; @@ -463,7 +461,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)  			return -ENOMEM;  		} -		ret = set_page_extent_mapped(page); +		ret = set_folio_extent_mapped(page_folio(page));  		if (ret < 0) {  			unlock_page(page);  			put_page(page); @@ -1189,7 +1187,6 @@ update_cache_item(struct btrfs_trans_handle *trans,  	btrfs_set_free_space_entries(leaf, header, entries);  	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);  	btrfs_set_free_space_generation(leaf, header, trans->transid); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	return 0; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7ba50e133921..cae540ec15ed 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,  			      struct btrfs_free_space_info);  	btrfs_set_free_space_extent_count(leaf, info, 0);  	btrfs_set_free_space_flags(leaf, info, 0); -	btrfs_mark_buffer_dirty(trans, leaf);  	ret = 0;  out: @@ -287,7 +286,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,  	flags |= BTRFS_FREE_SPACE_USING_BITMAPS;  	btrfs_set_free_space_flags(leaf, info, flags);  	expected_extent_count = btrfs_free_space_extent_count(leaf, info); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	if (extent_count != expected_extent_count) { @@ -324,7 +322,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,  		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);  		write_extent_buffer(leaf, bitmap_cursor, ptr,  				    data_size); -		btrfs_mark_buffer_dirty(trans, leaf);  		btrfs_release_path(path);  		i += extent_size; @@ -430,7 +427,6 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,  	flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;  	btrfs_set_free_space_flags(leaf, info, flags);  	expected_extent_count = btrfs_free_space_extent_count(leaf, info); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +491,6 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,  	extent_count += new_extents;  	btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	btrfs_release_path(path);  	if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && @@ -1350,6 +1345,12 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)  			btrfs_end_transaction(trans);  			return ret;  		} +		if (btrfs_should_end_transaction(trans)) { +			btrfs_end_transaction(trans); +			trans = btrfs_start_transaction(free_space_root, 1); +			if (IS_ERR(trans)) +				return PTR_ERR(trans); +		}  		node = rb_next(node);  	} diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 31c1648bc0b4..09cfb43580cb 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -4,6 +4,136 @@  #include "ctree.h"  #include "fs.h"  #include "accessors.h" +#include "volumes.h" + +static const struct btrfs_csums { +	u16		size; +	const char	name[10]; +	const char	driver[12]; +} btrfs_csums[] = { +	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, +	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, +	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, +	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", +				     .driver = "blake2b-256" }, +}; + +/* This exists 
for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ +	return btrfs_csums[type].size; +} + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ +	u16 t = btrfs_super_csum_type(s); + +	/* csum type is validated at mount time. */ +	return btrfs_csum_type_size(t); +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ +	/* csum type is validated at mount time. */ +	return btrfs_csums[csum_type].name; +} + +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name. + */ +const char *btrfs_super_csum_driver(u16 csum_type) +{ +	/* csum type is validated at mount time */ +	return btrfs_csums[csum_type].driver[0] ? +		btrfs_csums[csum_type].driver : +		btrfs_csums[csum_type].name; +} + +size_t __attribute_const__ btrfs_get_num_csums(void) +{ +	return ARRAY_SIZE(btrfs_csums); +} + +/* + * Start exclusive operation @type, return true on success. + */ +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, +			enum btrfs_exclusive_operation type) +{ +	bool ret = false; + +	spin_lock(&fs_info->super_lock); +	if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { +		fs_info->exclusive_operation = type; +		ret = true; +	} +	spin_unlock(&fs_info->super_lock); + +	return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one.  This must be paired with btrfs_exclop_start_unlock() + * and btrfs_exclop_finish(). + * + * Compatibility: + * - the same type is already running + * - when trying to add a device and balance has been paused + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + *   must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, +				 enum btrfs_exclusive_operation type) +{ +	spin_lock(&fs_info->super_lock); +	if (fs_info->exclusive_operation == type || +	    (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && +	     type == BTRFS_EXCLOP_DEV_ADD)) +		return true; + +	spin_unlock(&fs_info->super_lock); +	return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ +	spin_unlock(&fs_info->super_lock); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ +	spin_lock(&fs_info->super_lock); +	WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); +	spin_unlock(&fs_info->super_lock); +	sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, +			  enum btrfs_exclusive_operation op) +{ +	switch (op) { +	case BTRFS_EXCLOP_BALANCE_PAUSED: +		spin_lock(&fs_info->super_lock); +		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || +		       fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || +		       fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || +		       fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); +		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; +		spin_unlock(&fs_info->super_lock); +		break; +	case BTRFS_EXCLOP_BALANCE: +		spin_lock(&fs_info->super_lock); +		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); +		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; +		spin_unlock(&fs_info->super_lock); +		break; +	default: +		btrfs_warn(fs_info, +			"invalid exclop balance operation %d requested", op); +	} +}  void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,  			     const char *name) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 
79a1a3d6f04d..b572d6b9730b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -14,10 +14,10 @@  #include <linux/lockdep.h>  #include <linux/spinlock.h>  #include <linux/mutex.h> -#include <linux/rwlock_types.h>  #include <linux/rwsem.h>  #include <linux/semaphore.h>  #include <linux/list.h> +#include <linux/pagemap.h>  #include <linux/radix-tree.h>  #include <linux/workqueue.h>  #include <linux/wait.h> @@ -627,6 +627,9 @@ struct btrfs_fs_info {  	struct kobject *qgroups_kobj;  	struct kobject *discard_kobj; +	/* Track the number of blocks (sectors) read by the filesystem. */ +	struct percpu_counter stats_read_blocks; +  	/* Used to keep from writing metadata until there is a nice batch */  	struct percpu_counter dirty_metadata_bytes;  	struct percpu_counter delalloc_bytes; @@ -887,6 +890,11 @@ struct btrfs_fs_info {  #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode),			\  					   struct inode *: (_inode)))->root->fs_info) +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ +	return mapping_gfp_constraint(mapping, ~__GFP_FS); +} +  static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)  {  	return READ_ONCE(fs_info->generation); @@ -953,6 +961,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,  #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \  					sizeof(struct btrfs_item)) +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits) +  static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)  {  	return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; @@ -982,6 +992,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,  int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); +u16 btrfs_csum_type_size(u16 type); +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +static inline bool btrfs_is_empty_uuid(const u8 *uuid) +{ +	return uuid_is_null((const uuid_t *)uuid); +} +  /* Compatibility and incompatibility defines */  void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,  			     const char *name); @@ -1058,6 +1079,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)  	(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,		\  			   &(fs_info)->fs_state))) +/* + * We use folio flag owner_2 to indicate there is an ordered extent with + * unfinished IO. 
+ */ +#define folio_test_ordered(folio)	folio_test_owner_2(folio) +#define folio_set_ordered(folio)	folio_set_owner_2(folio) +#define folio_clear_ordered(folio)	folio_clear_owner_2(folio) +  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  #define EXPORT_FOR_TESTS diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 29572dfaf878..448aa1a682d6 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -298,8 +298,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,  	ptr = (unsigned long)&extref->name;  	write_extent_buffer(path->nodes[0], name->name, ptr, name->len); -	btrfs_mark_buffer_dirty(trans, path->nodes[0]); -  out:  	btrfs_free_path(path);  	return ret; @@ -363,8 +361,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,  		ptr = (unsigned long)(ref + 1);  	}  	write_extent_buffer(path->nodes[0], name->name, ptr, name->len); -	btrfs_mark_buffer_dirty(trans, path->nodes[0]); -  out:  	btrfs_free_path(path); @@ -590,7 +586,6 @@ search_again:  				num_dec = (orig_num_bytes - extent_num_bytes);  				if (extent_start != 0)  					control->sub_bytes += num_dec; -				btrfs_mark_buffer_dirty(trans, leaf);  			} else {  				extent_num_bytes =  					btrfs_file_extent_disk_num_bytes(leaf, fi); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27b2fe7f735d..fe2c810335ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,34 +393,13 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)   * extent (btrfs_finish_ordered_io()).   */  static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, -						 struct folio *locked_folio,  						 u64 offset, u64 bytes)  {  	unsigned long index = offset >> PAGE_SHIFT;  	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; -	u64 page_start = 0, page_end = 0;  	struct folio *folio; -	if (locked_folio) { -		page_start = folio_pos(locked_folio); -		page_end = page_start + folio_size(locked_folio) - 1; -	} -  	while (index <= end_index) { -		/* -		 * For locked page, we will call btrfs_mark_ordered_io_finished -		 * through btrfs_mark_ordered_io_finished() on it -		 * in run_delalloc_range() for the error handling, which will -		 * clear page Ordered and run the ordered extent accounting. -		 * -		 * Here we can't just clear the Ordered bit, or -		 * btrfs_mark_ordered_io_finished() would skip the accounting -		 * for the page range, and the ordered extent will never finish. 
-		 */ -		if (locked_folio && index == (page_start >> PAGE_SHIFT)) { -			index++; -			continue; -		}  		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);  		index++;  		if (IS_ERR(folio)) @@ -436,23 +415,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,  		folio_put(folio);  	} -	if (locked_folio) { -		/* The locked page covers the full range, nothing needs to be done */ -		if (bytes + offset <= page_start + folio_size(locked_folio)) -			return; -		/* -		 * In case this page belongs to the delalloc range being -		 * instantiated then skip it, since the first page of a range is -		 * going to be properly cleaned up by the caller of -		 * run_delalloc_range -		 */ -		if (page_start >= offset && page_end <= (offset + bytes - 1)) { -			bytes = offset + bytes - folio_pos(locked_folio) - -				folio_size(locked_folio); -			offset = folio_pos(locked_folio) + folio_size(locked_folio); -		} -	} -  	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);  } @@ -564,7 +526,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,  		kunmap_local(kaddr);  		folio_put(folio);  	} -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	/* @@ -1129,19 +1090,14 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,  			       &wbc, false);  	wbc_detach_inode(&wbc);  	if (ret < 0) { -		btrfs_cleanup_ordered_extents(inode, locked_folio, -					      start, end - start + 1); -		if (locked_folio) { -			const u64 page_start = folio_pos(locked_folio); - -			folio_start_writeback(locked_folio); -			folio_end_writeback(locked_folio); -			btrfs_mark_ordered_io_finished(inode, locked_folio, -						       page_start, PAGE_SIZE, -						       !ret); -			mapping_set_error(locked_folio->mapping, ret); -			folio_unlock(locked_folio); -		} +		btrfs_cleanup_ordered_extents(inode, start, end - start + 1); +		if (locked_folio) +			btrfs_folio_end_lock(inode->root->fs_info, locked_folio, +					     start, async_extent->ram_size); +		btrfs_err_rl(inode->root->fs_info, +			"%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", +			     __func__, btrfs_root_id(inode->root), +			     btrfs_ino(inode), start, async_extent->ram_size, ret);  	}  } @@ -1373,6 +1329,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);  	/* +	 * We're not doing compressed IO, don't unlock the first page (which +	 * the caller expects to stay locked), don't clear any dirty bits and +	 * don't set any writeback bits. +	 * +	 * Do set the Ordered (Private2) bit so we know this page was properly +	 * setup for writepage. +	 */ +	page_ops = (keep_locked ? 0 : PAGE_UNLOCK); +	page_ops |= PAGE_SET_ORDERED; + +	/*  	 * Relocation relies on the relocated extents to have exactly the same  	 * size as the original extents. Normally writeback for relocation data  	 * extents follows a NOCOW path because relocation preallocates the @@ -1431,6 +1398,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		file_extent.offset = 0;  		file_extent.compression = BTRFS_COMPRESS_NONE; +		/* +		 * Locked range will be released either during error clean up or +		 * after the whole range is finished. 
+		 */  		lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,  			    &cached); @@ -1476,21 +1447,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		btrfs_dec_block_group_reservations(fs_info, ins.objectid); -		/* -		 * We're not doing compressed IO, don't unlock the first page -		 * (which the caller expects to stay locked), don't clear any -		 * dirty bits and don't set any writeback bits -		 * -		 * Do set the Ordered flag so we know this page was -		 * properly setup for writepage. -		 */ -		page_ops = (keep_locked ? 0 : PAGE_UNLOCK); -		page_ops |= PAGE_SET_ORDERED; - -		extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, -					     locked_folio, &cached, -					     EXTENT_LOCKED | EXTENT_DELALLOC, -					     page_ops);  		if (num_bytes < cur_alloc_size)  			num_bytes = 0;  		else @@ -1507,6 +1463,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		if (ret)  			goto out_unlock;  	} +	extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, +				     EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);  done:  	if (done_offset)  		*done_offset = end; @@ -1527,35 +1485,30 @@ out_unlock:  	 * We process each region below.  	 */ -	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | -		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; -	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; -  	/*  	 * For the range (1). We have already instantiated the ordered extents  	 * for this region. They are cleaned up by  	 * btrfs_cleanup_ordered_extents() in e.g, -	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are -	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | -	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup -	 * function. +	 * btrfs_run_delalloc_range(). +	 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV +	 * are also handled by the cleanup function.  	 * -	 * However, in case of @keep_locked, we still need to unlock the pages -	 * (except @locked_folio) to ensure all the pages are unlocked. +	 * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and +	 * finish the writeback of the involved folios, which will be never submitted.  	 */ -	if (keep_locked && orig_start < start) { +	if (orig_start < start) { +		clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; +		page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; +  		if (!locked_folio)  			mapping_set_error(inode->vfs_inode.i_mapping, ret);  		extent_clear_unlock_delalloc(inode, orig_start, start - 1, -					     locked_folio, NULL, 0, page_ops); +					     locked_folio, NULL, clear_bits, page_ops);  	} -	/* -	 * At this point we're unlocked, we want to make sure we're only -	 * clearing these flags under the extent lock, so lock the rest of the -	 * range and clear everything up. -	 */ -	lock_extent(&inode->io_tree, start, end, NULL); +	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | +		     EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; +	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;  	/*  	 * For the range (2). 
If we reserved an extent for our delalloc range @@ -1589,6 +1542,10 @@ out_unlock:  		btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,  				       end - start - cur_alloc_size + 1, NULL);  	} +	btrfs_err_rl(fs_info, +		     "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", +		     __func__, btrfs_root_id(inode->root), +		     btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);  	return ret;  } @@ -1809,7 +1766,7 @@ static int fallback_to_cow(struct btrfs_inode *inode,  			bytes = range_bytes;  		spin_lock(&sinfo->lock); -		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); +		btrfs_space_info_update_bytes_may_use(sinfo, bytes);  		spin_unlock(&sinfo->lock);  		if (count > 0) @@ -1837,7 +1794,6 @@ struct can_nocow_file_extent_args {  	/* End file offset (inclusive) of the range we want to NOCOW. */  	u64 end;  	bool writeback_path; -	bool strict;  	/*  	 * Free the path passed to can_nocow_file_extent() once it's not needed  	 * anymore. @@ -1892,8 +1848,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	 * for its subvolume was created, then this implies the extent is shared,  	 * hence we must COW.  	 */ -	if (!args->strict && -	    btrfs_file_extent_generation(leaf, fi) <= +	if (btrfs_file_extent_generation(leaf, fi) <=  	    btrfs_root_last_snapshot(&root->root_item))  		goto out; @@ -1922,9 +1877,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	 */  	btrfs_release_path(path); -	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), -				    key->offset - args->file_extent.offset, -				    args->file_extent.disk_bytenr, args->strict, path); +	ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, +				    args->file_extent.disk_bytenr, path);  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);  	if (ret != 0)  		goto out; @@ -1971,6 +1925,53 @@ static int can_nocow_file_extent(struct btrfs_path *path,  }  /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, +				 struct folio *locked_folio, +				 u64 start, u64 end, int error) +{ +	struct btrfs_fs_info *fs_info = inode->root->fs_info; +	struct address_space *mapping = inode->vfs_inode.i_mapping; +	pgoff_t start_index = start >> PAGE_SHIFT; +	pgoff_t end_index = end >> PAGE_SHIFT; +	u32 len; + +	ASSERT(end + 1 - start < U32_MAX); +	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && +	       IS_ALIGNED(end + 1, fs_info->sectorsize)); +	len = end + 1 - start; + +	/* +	 * Handle the locked folio first. +	 * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. +	 */ +	btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + +	for (pgoff_t index = start_index; index <= end_index; index++) { +		struct folio *folio; + +		/* Already handled at the beginning. */ +		if (index == locked_folio->index) +			continue; +		folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); +		/* Cache already dropped, no need to do any cleanup. 
*/
+		if (IS_ERR(folio))
+			continue;
+		btrfs_folio_clamp_finish_io(fs_info, folio, start, len);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+	mapping_set_error(mapping, error);
+}
+
+/*
  * when nowcow writeback call back.  This checks for snapshots or COW copies
  * of the extents that exist in the file, and COWs the file as required.
  *
@@ -1985,6 +1986,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
 	struct btrfs_root *root = inode->root;
 	struct btrfs_path *path;
 	u64 cow_start = (u64)-1;
+	/*
+	 * If not 0, represents the inclusive end of the last fallback_to_cow()
+	 * range. Only for error handling.
+	 */
+	u64 cow_end = 0;
 	u64 cur_offset = start;
 	int ret;
 	bool check_prev = true;
@@ -2145,6 +2151,7 @@ must_cow:
 					      found_key.offset - 1);
 			cow_start = (u64)-1;
 			if (ret) {
+				cow_end = found_key.offset - 1;
 				btrfs_dec_nocow_writers(nocow_bg);
 				goto error;
 			}
@@ -2218,11 +2225,12 @@ must_cow:
 		cow_start = cur_offset;
 	if (cow_start != (u64)-1) {
-		cur_offset = end;
 		ret = fallback_to_cow(inode, locked_folio, cow_start, end);
 		cow_start = (u64)-1;
-		if (ret)
+		if (ret) {
+			cow_end = end;
 			goto error;
+		}
 	}
 	btrfs_free_path(path);
@@ -2230,12 +2238,41 @@ must_cow:
 error:
 	/*
+	 * There are several error cases:
+	 *
+	 * 1) Failed without falling back to COW
+	 *    start         cur_offset             end
+	 *    |/////////////|                      |
+	 *
+	 *    For range [start, cur_offset) the folios are already unlocked (except
+	 *    @locked_folio), EXTENT_DELALLOC already removed.
+	 *    Only need to clear the dirty flag as they will never be submitted.
+	 *    Ordered extent and extent maps are handled by
+	 *    btrfs_mark_ordered_io_finished() inside run_delalloc_range().
+	 *
+	 * 2) Failed with error from fallback_to_cow()
+	 *    start         cur_offset  cow_end    end
+	 *    |/////////////|-----------|          |
+	 *
+	 *    For range [start, cur_offset) it's the same as case 1).
+	 *    But for range [cur_offset, cow_end), the folios have the dirty flag
+	 *    cleared and are unlocked, EXTENT_DELALLOC cleared by cow_file_range().
+	 *
+	 *    Thus we should not call extent_clear_unlock_delalloc() on range
+	 *    [cur_offset, cow_end), as the folios are already unlocked.
+	 *
+	 * So clear the folio dirty flags for [start, cur_offset) first.
+	 */
+	if (cur_offset > start)
+		cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+
+	/*
 	 * If an error happened while a COW region is outstanding, cur_offset
-	 * needs to be reset to cow_start to ensure the COW region is unlocked
-	 * as well.
+	 * needs to be reset to @cow_end + 1 to skip the COW range, as
+	 * cow_file_range() will do the proper cleanup on error.
 	
*/ -	if (cow_start != (u64)-1) -		cur_offset = cow_start; +	if (cow_end) +		cur_offset = cow_end + 1;  	/*  	 * We need to lock the extent here because we're clearing DELALLOC and @@ -2255,6 +2292,10 @@ error:  		btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);  	}  	btrfs_free_path(path); +	btrfs_err_rl(fs_info, +		     "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", +		     __func__, btrfs_root_id(inode->root), +		     btrfs_ino(inode), start, end + 1 - start, ret);  	return ret;  } @@ -2305,8 +2346,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol  out:  	if (ret < 0) -		btrfs_cleanup_ordered_extents(inode, locked_folio, start, -					      end - start + 1); +		btrfs_cleanup_ordered_extents(inode, start, end - start + 1);  	return ret;  } @@ -2921,7 +2961,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  			btrfs_item_ptr_offset(leaf, path->slots[0]),  			sizeof(struct btrfs_file_extent_item)); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path);  	/* @@ -4085,7 +4124,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,  				    struct btrfs_inode_item);  	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_set_inode_last_trans(trans, inode);  	ret = 0;  failed: @@ -6380,7 +6418,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  		}  	} -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	/*  	 * We don't need the path anymore, plus inheriting properties, adding  	 * ACLs, security xattrs, orphan item or adding the link, will result in @@ -7011,8 +7048,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)   * @orig_start:	(optional) Return the original file offset of the file extent   * @orig_len:	(optional) Return the original on-disk length of the file extent   * @ram_bytes:	(optional) Return the ram_bytes of the file extent - * @strict:	if true, omit optimizations that might force us into unnecessary - *		cow. e.g., don't trust generation number.   
*   * Return:   * >0	and update @len if we can do nocow write @@ -7024,7 +7059,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)   */  noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,  			      struct btrfs_file_extent *file_extent, -			      bool nowait, bool strict) +			      bool nowait)  {  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);  	struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7077,7 +7112,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,  	nocow_args.start = offset;  	nocow_args.end = offset + *len - 1; -	nocow_args.strict = strict;  	nocow_args.free_path = true;  	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); @@ -8027,31 +8061,45 @@ static int btrfs_rename_exchange(struct inode *old_dir,  	/* src is a subvolume */  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {  		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		}  	} else { /* src is an inode */  		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),  					   BTRFS_I(old_dentry->d_inode),  					   old_name, &old_rename_ctx); -		if (!ret) -			ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); -	} -	if (ret) { -		btrfs_abort_transaction(trans, ret); -		goto out_fail; +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		} +		ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		}  	}  	/* dest is a subvolume */  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {  		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		}  	} else { /* dest is an inode */  		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),  					   BTRFS_I(new_dentry->d_inode),  					   new_name, &new_rename_ctx); -		if (!ret) -			ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); -	} -	if (ret) { -		btrfs_abort_transaction(trans, ret); -		goto out_fail; +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		} +		ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		}  	}  	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -8287,16 +8335,23 @@ static int btrfs_rename(struct mnt_idmap *idmap,  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {  		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		}  	} else {  		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),  					   BTRFS_I(d_inode(old_dentry)),  					   &old_fname.disk_name, &rename_ctx); -		if (!ret) -			ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); -	} -	if (ret) { -		btrfs_abort_transaction(trans, ret); -		goto out_fail; +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		} +		ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_fail; +		}  	}  	if (new_inode) { @@ -8304,18 +8359,27 @@ static int btrfs_rename(struct mnt_idmap *idmap,  		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==  			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {  			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); +			if (ret) { +				btrfs_abort_transaction(trans, ret); +				goto out_fail; +			}  			
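The rename hunks above drop the shared "if (ret)" check and instead abort the transaction at each failure site, so the abort that ends up in the log points at the exact step that failed. A minimal sketch of that pattern, with step_one() and step_two() as hypothetical stand-ins for the unlink/update calls (illustration only, not code from the patch):

	/*
	 * Illustration only: abort right where a step fails instead of
	 * funnelling every error through one shared check.  step_one()
	 * and step_two() are hypothetical helpers.
	 */
	static int two_step_update(struct btrfs_trans_handle *trans)
	{
		int ret;

		ret = step_one(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		ret = step_two(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		return 0;
	}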
BUG_ON(new_inode->i_nlink == 0);  		} else {  			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),  						 BTRFS_I(d_inode(new_dentry)),  						 &new_fname.disk_name); +			if (ret) { +				btrfs_abort_transaction(trans, ret); +				goto out_fail; +			}  		} -		if (!ret && new_inode->i_nlink == 0) +		if (new_inode->i_nlink == 0) {  			ret = btrfs_orphan_add(trans,  					BTRFS_I(d_inode(new_dentry))); -		if (ret) { -			btrfs_abort_transaction(trans, ret); -			goto out_fail; +			if (ret) { +				btrfs_abort_transaction(trans, ret); +				goto out_fail; +			}  		}  	} @@ -8655,7 +8719,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,  	ptr = btrfs_file_extent_inline_start(ei);  	write_extent_buffer(leaf, symname, ptr, name_len); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_free_path(path);  	d_instantiate_new(dentry, inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4d9305fa37a8..ae98269a5e3a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -403,86 +403,6 @@ update_flags:  	return ret;  } -/* - * Start exclusive operation @type, return true on success - */ -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, -			enum btrfs_exclusive_operation type) -{ -	bool ret = false; - -	spin_lock(&fs_info->super_lock); -	if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { -		fs_info->exclusive_operation = type; -		ret = true; -	} -	spin_unlock(&fs_info->super_lock); - -	return ret; -} - -/* - * Conditionally allow to enter the exclusive operation in case it's compatible - * with the running one.  This must be paired with btrfs_exclop_start_unlock and - * btrfs_exclop_finish. - * - * Compatibility: - * - the same type is already running - * - when trying to add a device and balance has been paused - * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller - *   must check the condition first that would allow none -> @type - */ -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, -				 enum btrfs_exclusive_operation type) -{ -	spin_lock(&fs_info->super_lock); -	if (fs_info->exclusive_operation == type || -	    (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && -	     type == BTRFS_EXCLOP_DEV_ADD)) -		return true; - -	spin_unlock(&fs_info->super_lock); -	return false; -} - -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) -{ -	spin_unlock(&fs_info->super_lock); -} - -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) -{ -	spin_lock(&fs_info->super_lock); -	WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); -	spin_unlock(&fs_info->super_lock); -	sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); -} - -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, -			  enum btrfs_exclusive_operation op) -{ -	switch (op) { -	case BTRFS_EXCLOP_BALANCE_PAUSED: -		spin_lock(&fs_info->super_lock); -		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || -		       fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || -		       fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || -		       fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); -		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; -		spin_unlock(&fs_info->super_lock); -		break; -	case BTRFS_EXCLOP_BALANCE: -		spin_lock(&fs_info->super_lock); -		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); -		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; -		spin_unlock(&fs_info->super_lock); -		break; -	default: -		btrfs_warn(fs_info, -			"invalid exclop balance 
operation %d requested", op); -	} -} -  static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)  {  	return put_user(inode->i_generation, arg); @@ -551,17 +471,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,  	return ret;  } -int __pure btrfs_is_empty_uuid(const u8 *uuid) -{ -	int i; - -	for (i = 0; i < BTRFS_UUID_SIZE; i++) { -		if (uuid[i]) -			return 0; -	} -	return 1; -} -  /*   * Calculate the number of transaction items to reserve for creating a subvolume   * or snapshot, not including the inode, directory entries, or parent directory. @@ -3007,7 +2916,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);  	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	btrfs_release_path(path);  	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -5028,6 +4936,128 @@ out_acct:  	return ret;  } +static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ +	loff_t pos; +	struct kiocb kiocb; +	struct file *file; +	ssize_t ret; +	void __user *sqe_addr; +	struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + +	if (!capable(CAP_SYS_ADMIN)) { +		ret = -EPERM; +		goto out_acct; +	} + +	file = cmd->file; +	sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + +	if (!(file->f_mode & FMODE_WRITE)) { +		ret = -EBADF; +		goto out_acct; +	} + +	if (!data) { +		data = kzalloc(sizeof(*data), GFP_NOFS); +		if (!data) { +			ret = -ENOMEM; +			goto out_acct; +		} + +		io_uring_cmd_get_async_data(cmd)->op_data = data; + +		if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) +			struct btrfs_ioctl_encoded_io_args_32 args32; + +			if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { +				ret = -EFAULT; +				goto out_acct; +			} +			data->args.iov = compat_ptr(args32.iov); +			data->args.iovcnt = args32.iovcnt; +			data->args.offset = args32.offset; +			data->args.flags = args32.flags; +			data->args.len = args32.len; +			data->args.unencoded_len = args32.unencoded_len; +			data->args.unencoded_offset = args32.unencoded_offset; +			data->args.compression = args32.compression; +			data->args.encryption = args32.encryption; +			memcpy(data->args.reserved, args32.reserved, +			       sizeof(data->args.reserved)); +#else +			ret = -ENOTTY; +			goto out_acct; +#endif +		} else { +			if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { +				ret = -EFAULT; +				goto out_acct; +			} +		} + +		ret = -EINVAL; +		if (data->args.flags != 0) +			goto out_acct; +		if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) +			goto out_acct; +		if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && +		    data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) +			goto out_acct; +		if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || +		    data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) +			goto out_acct; +		if (data->args.unencoded_offset > data->args.unencoded_len) +			goto out_acct; +		if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) +			goto out_acct; + +		data->iov = data->iovstack; +		ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, +				   ARRAY_SIZE(data->iovstack), &data->iov, +				   &data->iter); +		if (ret < 0) +			goto out_acct; + +		if (iov_iter_count(&data->iter) == 0) { +			ret = 0; +			goto out_iov; +		} +	} + +	if 
(issue_flags & IO_URING_F_NONBLOCK) { +		ret = -EAGAIN; +		goto out_acct; +	} + +	pos = data->args.offset; +	ret = rw_verify_area(WRITE, file, &pos, data->args.len); +	if (ret < 0) +		goto out_iov; + +	init_sync_kiocb(&kiocb, file); +	ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); +	if (ret) +		goto out_iov; +	kiocb.ki_pos = pos; + +	file_start_write(file); + +	ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); +	if (ret > 0) +		fsnotify_modify(file); + +	file_end_write(file); +out_iov: +	kfree(data->iov); +out_acct: +	if (ret > 0) +		add_wchar(current, ret); +	inc_syscw(current); +	return ret; +} +  int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)  {  	switch (cmd->cmd_op) { @@ -5036,6 +5066,12 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)  	case BTRFS_IOC_ENCODED_READ_32:  #endif  		return btrfs_uring_encoded_read(cmd, issue_flags); + +	case BTRFS_IOC_ENCODED_WRITE: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) +	case BTRFS_IOC_ENCODED_WRITE_32: +#endif +		return btrfs_uring_encoded_write(cmd, issue_flags);  	}  	return -EINVAL; @@ -5308,6 +5344,8 @@ long btrfs_ioctl(struct file *file, unsigned int  		return fsverity_ioctl_enable(file, (const void __user *)argp);  	case FS_IOC_MEASURE_VERITY:  		return fsverity_ioctl_measure(file, argp); +	case FS_IOC_READ_VERITY_METADATA: +		return fsverity_ioctl_read_metadata(file, argp);  	case BTRFS_IOC_ENCODED_READ:  		return btrfs_ioctl_encoded_read(file, argp, false);  	case BTRFS_IOC_ENCODED_WRITE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 2b760c8778f8..ce915fcda43b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -19,7 +19,6 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,  		       struct dentry *dentry, struct fileattr *fa);  int btrfs_ioctl_get_supported_features(void __user *arg);  void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(const u8 *uuid);  void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,  				     struct btrfs_ioctl_balance_args *bargs);  int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 35036b151bf5..c69e57ff804b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -199,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)  {  	lockdep_assert_held_write(&eb->lock);  } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ +	lockdep_assert_held_read(&eb->lock); +}  #else  static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }  #endif  void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f9b214992212..b90fabe302e6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -673,9 +673,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,  	key.offset = dst;  	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - -	btrfs_mark_buffer_dirty(trans, path->nodes[0]); -  	btrfs_free_path(path);  	return ret;  } @@ -752,8 +749,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,  	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);  	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); -	btrfs_mark_buffer_dirty(trans, leaf); -  	btrfs_release_path(path);  	key.type = BTRFS_QGROUP_LIMIT_KEY; @@ -771,8 +766,6 @@ static int 
add_qgroup_item(struct btrfs_trans_handle *trans,  	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);  	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); -	btrfs_mark_buffer_dirty(trans, leaf); -  	ret = 0;  out:  	btrfs_free_path(path); @@ -859,9 +852,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,  	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);  	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);  	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - -	btrfs_mark_buffer_dirty(trans, l); -  out:  	btrfs_free_path(path);  	return ret; @@ -905,9 +895,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,  	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);  	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);  	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - -	btrfs_mark_buffer_dirty(trans, l); -  out:  	btrfs_free_path(path);  	return ret; @@ -947,9 +934,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)  	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);  	btrfs_set_qgroup_status_rescan(l, ptr,  				fs_info->qgroup_rescan_progress.objectid); - -	btrfs_mark_buffer_dirty(trans, l); -  out:  	btrfs_free_path(path);  	return ret; @@ -1130,8 +1114,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,  				      BTRFS_QGROUP_STATUS_FLAGS_MASK);  	btrfs_set_qgroup_status_rescan(leaf, ptr, 0); -	btrfs_mark_buffer_dirty(trans, leaf); -  	key.objectid = 0;  	key.type = BTRFS_ROOT_REF_KEY;  	key.offset = 0; @@ -1838,9 +1820,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)  	 * Thus its reserved space should all be zero, no matter if qgroup  	 * is consistent or the mode.  	 */ -	WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || -		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); +	if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || +	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || +	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { +		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); +		btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", +			      btrfs_qgroup_level(qgroup->qgroupid), +			      btrfs_qgroup_subvolid(qgroup->qgroupid), +			      qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], +			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], +			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + +	}  	/*  	 * The same for rfer/excl numbers, but that's only if our qgroup is  	 * consistent and if it's in regular qgroup mode. 
@@ -1849,8 +1841,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)  	 */  	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&  	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { -		if (WARN_ON(qgroup->rfer || qgroup->excl || -			    qgroup->rfer_cmpr || qgroup->excl_cmpr)) { +		if (qgroup->rfer || qgroup->excl || +		    qgroup->rfer_cmpr || qgroup->excl_cmpr) { +			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));  			btrfs_warn_rl(fs_info,  "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",  				      btrfs_qgroup_level(qgroup->qgroupid), diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9ffc79f250fb..1834011ccc49 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,12 +13,13 @@  #include "volumes.h"  #include "print-tree.h" -static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,  					       struct btrfs_path *path,  					       const struct btrfs_key *oldkey,  					       u64 newlen, u64 frontpad)  { -	struct btrfs_stripe_extent *extent; +	struct btrfs_root *stripe_root = trans->fs_info->stripe_root; +	struct btrfs_stripe_extent *extent, *newitem;  	struct extent_buffer *leaf;  	int slot;  	size_t item_size; @@ -27,23 +28,39 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,  		.type = BTRFS_RAID_STRIPE_KEY,  		.offset = newlen,  	}; +	int ret; +	ASSERT(newlen > 0);  	ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);  	leaf = path->nodes[0];  	slot = path->slots[0];  	item_size = btrfs_item_size(leaf, slot); + +	newitem = kzalloc(item_size, GFP_NOFS); +	if (!newitem) +		return -ENOMEM; +  	extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);  	for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {  		struct btrfs_raid_stride *stride = &extent->strides[i];  		u64 phys; -		phys = btrfs_raid_stride_physical(leaf, stride); -		btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); +		phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; +		btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys);  	} -	btrfs_set_item_key_safe(trans, path, &newkey); +	ret = btrfs_del_item(trans, stripe_root, path); +	if (ret) +		goto out; + +	btrfs_release_path(path); +	ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: +	kfree(newitem); +	return ret;  }  int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) @@ -59,9 +76,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le  	int slot;  	int ret; -	if (!stripe_root) +	if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root)  		return 0; +	if (!btrfs_is_testing(fs_info)) { +		struct btrfs_chunk_map *map; +		bool use_rst; + +		map = btrfs_find_chunk_map(fs_info, start, length); +		if (!map) +			return -EINVAL; +		use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); +		btrfs_free_chunk_map(map); +		if (!use_rst) +			return 0; +	} +  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -85,6 +115,37 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le  		found_end = found_start + key.offset;  		ret = 0; +		/* +		 * The stripe extent starts before the range we want to delete, +		 * but the range spans more than one stripe extent: +		 * +		 * |--- RAID Stripe Extent ---||--- RAID 
Stripe Extent ---| +		 *        |--- keep  ---|--- drop ---| +		 * +		 * This means we have to get the previous item, truncate its +		 * length and then restart the search. +		 */ +		if (found_start > start) { +			if (slot == 0) { +				ret = btrfs_previous_item(stripe_root, path, start, +							  BTRFS_RAID_STRIPE_KEY); +				if (ret) { +					if (ret > 0) +						ret = -ENOENT; +					break; +				} +			} else { +				path->slots[0]--; +			} + +			leaf = path->nodes[0]; +			slot = path->slots[0]; +			btrfs_item_key_to_cpu(leaf, &key, slot); +			found_start = key.objectid; +			found_end = found_start + key.offset; +			ASSERT(found_start <= start); +		} +  		if (key.type != BTRFS_RAID_STRIPE_KEY)  			break; @@ -96,6 +157,54 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le  					       found_start, found_end);  		/* +		 * The stripe extent starts before the range we want to delete +		 * and ends after the range we want to delete, i.e. we're +		 * punching a hole in the stripe extent: +		 * +		 *  |--- RAID Stripe Extent ---| +		 *  | keep |--- drop ---| keep | +		 * +		 * This means we need to a) truncate the existing item and b) +		 * create a second item for the remaining range. +		 */ +		if (found_start < start && found_end > end) { +			size_t item_size; +			u64 diff_start = start - found_start; +			u64 diff_end = found_end - end; +			struct btrfs_stripe_extent *extent; +			struct btrfs_key newkey = { +				.objectid = end, +				.type = BTRFS_RAID_STRIPE_KEY, +				.offset = diff_end, +			}; + +			/* The "right" item. */ +			ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); +			if (ret) +				break; + +			item_size = btrfs_item_size(leaf, path->slots[0]); +			extent = btrfs_item_ptr(leaf, path->slots[0], +						struct btrfs_stripe_extent); + +			for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { +				struct btrfs_raid_stride *stride = &extent->strides[i]; +				u64 phys; + +				phys = btrfs_raid_stride_physical(leaf, stride); +				phys += diff_start + length; +				btrfs_set_raid_stride_physical(leaf, stride, phys); +			} + +			/* The "left" item. */ +			path->slots[0]--; +			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +			btrfs_partially_delete_raid_extent(trans, path, &key, +							   diff_start, 0); +			break; +		} + +		/*  		 * The stripe extent starts before the range we want to delete:  		 *  		 * |--- RAID Stripe Extent ---| @@ -105,11 +214,18 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le  		 * length to the new size and then re-insert the item.  		 */  		if (found_start < start) { -			u64 diff = start - found_start; +			u64 diff_start = start - found_start;  			btrfs_partially_delete_raid_extent(trans, path, &key, -							   diff, 0); -			break; +							   diff_start, 0); + +			start += (key.offset - diff_start); +			length -= (key.offset - diff_start); +			if (length == 0) +				break; + +			btrfs_release_path(path); +			continue;  		}  		/* @@ -122,13 +238,16 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le  		 * length to the new size and then re-insert the item.  		 */  		if (found_end > end) { -			u64 diff = found_end - end; +			u64 diff_end = found_end - end;  			btrfs_partially_delete_raid_extent(trans, path, &key, -							   diff, diff); +							   key.offset - length, +							   length); +			ASSERT(key.offset - diff_end == length);  			break;  		} +		/* Finally we can delete the whole item, no more special cases. 
*/  		ret = btrfs_del_item(trans, stripe_root, path);  		if (ret)  			break; @@ -169,7 +288,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,  	write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),  			    item_size); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_free_path(path);  	return ret; @@ -199,12 +317,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,  	for (int i = 0; i < num_stripes; i++) {  		u64 devid = bioc->stripes[i].dev->devid;  		u64 physical = bioc->stripes[i].physical; -		u64 length = bioc->stripes[i].length;  		struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; -		if (length == 0) -			length = bioc->size; -  		btrfs_set_stack_raid_stride_devid(raid_stride, devid);  		btrfs_set_stack_raid_stride_physical(raid_stride, physical);  	} diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index db8b42f674b7..af0969b70b53 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,  		if (cur == node)  			ret = true; -		/* The node is the lowest node */ -		if (cur->lowest) { -			list_del_init(&cur->lower); -			cur->lowest = 0; -		} -  		/* Cleanup the lower edges */  		while (!list_empty(&cur->lower)) {  			struct btrfs_backref_edge *edge; @@ -373,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,  		 * cache to avoid unnecessary backref lookup.  		 */  		if (cur->level > 0) { -			list_add(&cur->list, &cache->detached);  			cur->detached = 1;  		} else {  			rb_erase(&cur->rb_node, &cache->rb_root); @@ -426,7 +419,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(  		goto out;  	} -	node->lowest = 1;  	cur = node;  	/* Breadth-first search to build backref cache */ @@ -470,92 +462,6 @@ out:  }  /* - * helper to add backref node for the newly created snapshot. 
- * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, -			      struct reloc_control *rc, -			      const struct btrfs_root *src, -			      struct btrfs_root *dest) -{ -	struct btrfs_root *reloc_root = src->reloc_root; -	struct btrfs_backref_cache *cache = &rc->backref_cache; -	struct btrfs_backref_node *node = NULL; -	struct btrfs_backref_node *new_node; -	struct btrfs_backref_edge *edge; -	struct btrfs_backref_edge *new_edge; -	struct rb_node *rb_node; - -	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); -	if (rb_node) { -		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); -		if (node->detached) -			node = NULL; -		else -			BUG_ON(node->new_bytenr != reloc_root->node->start); -	} - -	if (!node) { -		rb_node = rb_simple_search(&cache->rb_root, -					   reloc_root->commit_root->start); -		if (rb_node) { -			node = rb_entry(rb_node, struct btrfs_backref_node, -					rb_node); -			BUG_ON(node->detached); -		} -	} - -	if (!node) -		return 0; - -	new_node = btrfs_backref_alloc_node(cache, dest->node->start, -					    node->level); -	if (!new_node) -		return -ENOMEM; - -	new_node->lowest = node->lowest; -	new_node->checked = 1; -	new_node->root = btrfs_grab_root(dest); -	ASSERT(new_node->root); - -	if (!node->lowest) { -		list_for_each_entry(edge, &node->lower, list[UPPER]) { -			new_edge = btrfs_backref_alloc_edge(cache); -			if (!new_edge) -				goto fail; - -			btrfs_backref_link_edge(new_edge, edge->node[LOWER], -						new_node, LINK_UPPER); -		} -	} else { -		list_add_tail(&new_node->lower, &cache->leaves); -	} - -	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, -				   &new_node->rb_node); -	if (rb_node) -		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); - -	if (!new_node->lowest) { -		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { -			list_add_tail(&new_edge->list[LOWER], -				      &new_edge->node[LOWER]->upper); -		} -	} -	return 0; -fail: -	while (!list_empty(&new_node->lower)) { -		new_edge = list_entry(new_node->lower.next, -				      struct btrfs_backref_edge, list[UPPER]); -		list_del(&new_edge->list[UPPER]); -		btrfs_backref_free_edge(cache, new_edge); -	} -	btrfs_backref_free_node(cache, new_node); -	return -ENOMEM; -} - -/*   * helper to add 'address of tree root -> reloc tree' mapping   */  static int __add_reloc_root(struct btrfs_root *root) @@ -950,7 +856,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,  	u32 i;  	int ret = 0;  	int first = 1; -	int dirty = 0;  	if (rc->stage != UPDATE_DATA_PTRS)  		return 0; @@ -1030,7 +935,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,  		}  		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); -		dirty = 1;  		key.offset -= btrfs_file_extent_offset(leaf, fi);  		ref.action = BTRFS_ADD_DELAYED_REF; @@ -1061,8 +965,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,  			break;  		}  	} -	if (dirty) -		btrfs_mark_buffer_dirty(trans, leaf);  	if (inode)  		btrfs_add_delayed_iput(inode);  	return ret; @@ -1255,13 +1157,11 @@ again:  		 */  		btrfs_set_node_blockptr(parent, slot, new_bytenr);  		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); -		btrfs_mark_buffer_dirty(trans, parent);  		btrfs_set_node_blockptr(path->nodes[level],  					path->slots[level], old_bytenr);  		btrfs_set_node_ptr_generation(path->nodes[level],  					      path->slots[level], old_ptr_gen); -		
btrfs_mark_buffer_dirty(trans, path->nodes[level]);  		ref.action = BTRFS_ADD_DELAYED_REF;  		ref.bytenr = old_bytenr; @@ -2058,100 +1958,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,  	int index = 0;  	int ret; -	next = node; -	while (1) { -		cond_resched(); -		next = walk_up_backref(next, edges, &index); -		root = next->root; +	next = walk_up_backref(node, edges, &index); +	root = next->root; -		/* -		 * If there is no root, then our references for this block are -		 * incomplete, as we should be able to walk all the way up to a -		 * block that is owned by a root. -		 * -		 * This path is only for SHAREABLE roots, so if we come upon a -		 * non-SHAREABLE root then we have backrefs that resolve -		 * improperly. -		 * -		 * Both of these cases indicate file system corruption, or a bug -		 * in the backref walking code. -		 */ -		if (!root) { -			ASSERT(0); -			btrfs_err(trans->fs_info, -		"bytenr %llu doesn't have a backref path ending in a root", -				  node->bytenr); -			return ERR_PTR(-EUCLEAN); -		} -		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { -			ASSERT(0); -			btrfs_err(trans->fs_info, -	"bytenr %llu has multiple refs with one ending in a non-shareable root", -				  node->bytenr); -			return ERR_PTR(-EUCLEAN); -		} - -		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { -			ret = record_reloc_root_in_trans(trans, root); -			if (ret) -				return ERR_PTR(ret); -			break; -		} +	/* +	 * If there is no root, then our references for this block are +	 * incomplete, as we should be able to walk all the way up to a block +	 * that is owned by a root. +	 * +	 * This path is only for SHAREABLE roots, so if we come upon a +	 * non-SHAREABLE root then we have backrefs that resolve improperly. +	 * +	 * Both of these cases indicate file system corruption, or a bug in the +	 * backref walking code. +	 */ +	if (unlikely(!root)) { +		btrfs_err(trans->fs_info, +			  "bytenr %llu doesn't have a backref path ending in a root", +			  node->bytenr); +		return ERR_PTR(-EUCLEAN); +	} +	if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { +		btrfs_err(trans->fs_info, +			  "bytenr %llu has multiple refs with one ending in a non-shareable root", +			  node->bytenr); +		return ERR_PTR(-EUCLEAN); +	} -		ret = btrfs_record_root_in_trans(trans, root); +	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { +		ret = record_reloc_root_in_trans(trans, root);  		if (ret)  			return ERR_PTR(ret); -		root = root->reloc_root; - -		/* -		 * We could have raced with another thread which failed, so -		 * root->reloc_root may not be set, return ENOENT in this case. -		 */ -		if (!root) -			return ERR_PTR(-ENOENT); +		goto found; +	} -		if (next->new_bytenr != root->node->start) { -			/* -			 * We just created the reloc root, so we shouldn't have -			 * ->new_bytenr set and this shouldn't be in the changed -			 *  list.  If it is then we have multiple roots pointing -			 *  at the same bytenr which indicates corruption, or -			 *  we've made a mistake in the backref walking code. 
-			 */ -			ASSERT(next->new_bytenr == 0); -			ASSERT(list_empty(&next->list)); -			if (next->new_bytenr || !list_empty(&next->list)) { -				btrfs_err(trans->fs_info, -	"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", -					  node->bytenr, next->bytenr); -				return ERR_PTR(-EUCLEAN); -			} +	ret = btrfs_record_root_in_trans(trans, root); +	if (ret) +		return ERR_PTR(ret); +	root = root->reloc_root; -			next->new_bytenr = root->node->start; -			btrfs_put_root(next->root); -			next->root = btrfs_grab_root(root); -			ASSERT(next->root); -			list_add_tail(&next->list, -				      &rc->backref_cache.changed); -			mark_block_processed(rc, next); -			break; -		} +	/* +	 * We could have raced with another thread which failed, so +	 * root->reloc_root may not be set, return ENOENT in this case. +	 */ +	if (!root) +		return ERR_PTR(-ENOENT); -		WARN_ON(1); -		root = NULL; -		next = walk_down_backref(edges, &index); -		if (!next || next->level <= node->level) -			break; -	} -	if (!root) { +	if (next->new_bytenr) {  		/* -		 * This can happen if there's fs corruption or if there's a bug -		 * in the backref lookup code. +		 * We just created the reloc root, so we shouldn't have +		 * ->new_bytenr set yet. If it is then we have multiple roots +		 *  pointing at the same bytenr which indicates corruption, or +		 *  we've made a mistake in the backref walking code.  		 */ -		ASSERT(0); -		return ERR_PTR(-ENOENT); +		ASSERT(next->new_bytenr == 0); +		btrfs_err(trans->fs_info, +			  "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", +			  node->bytenr, next->bytenr); +		return ERR_PTR(-EUCLEAN);  	} +	next->new_bytenr = root->node->start; +	btrfs_put_root(next->root); +	next->root = btrfs_grab_root(root); +	ASSERT(next->root); +	mark_block_processed(rc, next); +found:  	next = node;  	/* setup backref node path for btrfs_reloc_cow_block */  	while (1) { @@ -2247,17 +2119,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,  	return num_bytes;  } -static int reserve_metadata_space(struct btrfs_trans_handle *trans, -				  struct reloc_control *rc, -				  struct btrfs_backref_node *node) +static int refill_metadata_space(struct btrfs_trans_handle *trans, +				 struct reloc_control *rc, u64 num_bytes)  { -	struct btrfs_root *root = rc->extent_root; -	struct btrfs_fs_info *fs_info = root->fs_info; -	u64 num_bytes; +	struct btrfs_fs_info *fs_info = trans->fs_info;  	int ret; -	u64 tmp; - -	num_bytes = calcu_metadata_size(rc, node) * 2;  	trans->block_rsv = rc->block_rsv;  	rc->reserved_bytes += num_bytes; @@ -2270,7 +2136,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,  	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,  				     BTRFS_RESERVE_FLUSH_LIMIT);  	if (ret) { -		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; +		u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; +  		while (tmp <= rc->reserved_bytes)  			tmp <<= 1;  		/* @@ -2288,6 +2155,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,  	return 0;  } +static int reserve_metadata_space(struct btrfs_trans_handle *trans, +				  struct reloc_control *rc, +				  struct btrfs_backref_node *node) +{ +	u64 num_bytes; + +	num_bytes = calcu_metadata_size(rc, node) * 2; +	return refill_metadata_space(trans, rc, num_bytes); +} +  /*   * relocate a block tree, and then update pointers in upper level   * blocks that reference the block to point to the new location. 
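One detail worth noting in the reworked select_reloc_root() above: the assertion now lives inside the failure branch, so builds with assertions enabled still stop immediately on the corrupted state, while other builds log it and return -EUCLEAN to the caller. A condensed sketch of that shape, where bad_backref stands in for the real check on next->new_bytenr and the message text is a placeholder (illustration only):

	/* Illustration only: assert-enabled builds trip here, others report and bail. */
	if (unlikely(bad_backref)) {
		ASSERT(!bad_backref);
		btrfs_err(fs_info, "backref cache is inconsistent");
		return ERR_PTR(-EUCLEAN);
	}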
@@ -2442,7 +2319,7 @@ next:  	if (!ret && node->pending) {  		btrfs_backref_drop_node_buffer(node); -		list_move_tail(&node->list, &rc->backref_cache.changed); +		list_del_init(&node->list);  		node->pending = 0;  	} @@ -2605,8 +2482,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  			/*  			 * This block was the root block of a root, and this is  			 * the first time we're processing the block and thus it -			 * should not have had the ->new_bytenr modified and -			 * should have not been included on the changed list. +			 * should not have had the ->new_bytenr modified.  			 *  			 * However in the case of corruption we could have  			 * multiple refs pointing to the same block improperly, @@ -2616,8 +2492,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  			 * normal user in the case of corruption.  			 */  			ASSERT(node->new_bytenr == 0); -			ASSERT(list_empty(&node->list)); -			if (node->new_bytenr || !list_empty(&node->list)) { +			if (node->new_bytenr) {  				btrfs_err(root->fs_info,  				  "bytenr %llu has improper references to it",  					  node->bytenr); @@ -2640,17 +2515,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  			btrfs_put_root(node->root);  			node->root = btrfs_grab_root(root);  			ASSERT(node->root); -			list_add_tail(&node->list, &rc->backref_cache.changed);  		} else { -			path->lowest_level = node->level; -			if (root == root->fs_info->chunk_root) -				btrfs_reserve_chunk_metadata(trans, false); -			ret = btrfs_search_slot(trans, root, key, path, 0, 1); -			btrfs_release_path(path); -			if (root == root->fs_info->chunk_root) -				btrfs_trans_release_chunk_metadata(trans); -			if (ret > 0) -				ret = 0; +			btrfs_err(root->fs_info, +				  "bytenr %llu resolved to a non-shareable root", +				  node->bytenr); +			ret = -EUCLEAN; +			goto out;  		}  		if (!ret)  			update_processed_blocks(rc, node); @@ -2658,11 +2528,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  		ret = do_relocation(trans, rc, node, key, path, 1);  	}  out: -	if (ret || node->level == 0 || node->cowonly) +	if (ret || node->level == 0)  		btrfs_backref_cleanup_node(&rc->backref_cache, node);  	return ret;  } +static int relocate_cowonly_block(struct btrfs_trans_handle *trans, +				  struct reloc_control *rc, struct tree_block *block, +				  struct btrfs_path *path) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_root *root; +	u64 num_bytes; +	int nr_levels; +	int ret; + +	root = btrfs_get_fs_root(fs_info, block->owner, true); +	if (IS_ERR(root)) +		return PTR_ERR(root); + +	nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; + +	num_bytes = fs_info->nodesize * nr_levels; +	ret = refill_metadata_space(trans, rc, num_bytes); +	if (ret) { +		btrfs_put_root(root); +		return ret; +	} +	path->lowest_level = block->level; +	if (root == root->fs_info->chunk_root) +		btrfs_reserve_chunk_metadata(trans, false); + +	ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); +	path->lowest_level = 0; +	btrfs_release_path(path); + +	if (root == root->fs_info->chunk_root) +		btrfs_trans_release_chunk_metadata(trans); +	if (ret > 0) +		ret = 0; +	btrfs_put_root(root); + +	return ret; +} +  /*   * relocate a list of blocks   */ @@ -2702,6 +2611,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,  	/* Do tree relocation */  	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { +		/* +		 * For COWonly blocks, or the data reloc tree, we only need to +		 * 
COW down to the block, there's no need to generate a backref +		 * tree. +		 */ +		if (block->owner && +		    (!is_fstree(block->owner) || +		     block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { +			ret = relocate_cowonly_block(trans, rc, block, path); +			if (ret) +				break; +			continue; +		} +  		node = build_backref_tree(trans, rc, &block->key,  					  block->level, block->bytenr);  		if (IS_ERR(node)) { @@ -2947,7 +2870,7 @@ again:  	/*  	 * We could have lost folio private when we dropped the lock to read the -	 * folio above, make sure we set_page_extent_mapped here so we have any +	 * folio above, make sure we set_folio_extent_mapped() here so we have any  	 * of the subpage blocksize stuff we need in place.  	 */  	ret = set_folio_extent_mapped(folio); @@ -3799,7 +3722,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,  	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);  	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |  					  BTRFS_INODE_PREALLOC); -	btrfs_mark_buffer_dirty(trans, leaf);  out:  	btrfs_free_path(path);  	return ret; @@ -4405,8 +4327,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,  		WARN_ON(!first_cow && level == 0);  		node = rc->backref_cache.path[level]; -		BUG_ON(node->bytenr != buf->start && -		       node->new_bytenr != buf->start); + +		/* +		 * If node->bytenr != buf->start and node->new_bytenr != +		 * buf->start then we've got the wrong backref node for what we +		 * expected to see here and the cache is incorrect. +		 */ +		if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { +			btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", +				  buf->start, node->bytenr, node->new_bytenr); +			return -EUCLEAN; +		}  		btrfs_backref_drop_node_buffer(node);  		atomic_inc(&cow->refs); @@ -4506,10 +4438,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,  		return ret;  	}  	new_root->reloc_root = btrfs_grab_root(reloc_root); - -	if (rc->create_reloc_tree) -		ret = clone_backref_node(trans, rc, root, reloc_root); -	return ret; +	return 0;  }  /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 33962671a96c..e22e6b06927a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -197,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root  	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));  	write_extent_buffer(l, item, ptr, sizeof(*item)); -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  out:  	btrfs_free_path(path);  	return ret; @@ -447,7 +446,6 @@ again:  	btrfs_set_root_ref_name_len(leaf, ref, name->len);  	ptr = (unsigned long)(ref + 1);  	write_extent_buffer(leaf, name->name, ptr, name->len); -	btrfs_mark_buffer_dirty(trans, leaf);  	if (key.type == BTRFS_ROOT_BACKREF_KEY) {  		btrfs_release_path(path); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 498c84323253..f437138fefbc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7259,7 +7259,7 @@ static int changed_cb(struct btrfs_path *left_path,  		      enum btrfs_compare_tree_result result,  		      struct send_ctx *sctx)  { -	int ret = 0; +	int ret;  	/*  	 * We can not hold the commit root semaphore here. 
This is because in @@ -7319,7 +7319,6 @@ static int changed_cb(struct btrfs_path *left_path,  			return 0;  		}  		result = BTRFS_COMPARE_TREE_CHANGED; -		ret = 0;  	}  	sctx->left_path = left_path; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 255e85f78313..a341d087567a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -14,6 +14,7 @@  #include "fs.h"  #include "accessors.h"  #include "extent-tree.h" +#include "zoned.h"  /*   * HOW DOES SPACE RESERVATION WORK @@ -127,6 +128,14 @@   *     churn a lot and we can avoid making some extent tree modifications if we   *     are able to delay for as long as possible.   * + *   RESET_ZONES + *     This state only applies to zoned mode. In zoned mode we cannot reuse a + *     region once it has been allocated and then freed until we reset the zone, + *     due to the sequential write zone requirement. The RESET_ZONES state resets + *     the zones of an unused block group and lets us reuse the space. Reusing + *     the space is faster than removing the block group and allocating another + *     block group on the zones. + *   *   ALLOC_CHUNK   *     We will skip this the first time through space reservation, because of   *     overcommit and we don't want to have a lot of useless metadata space when @@ -316,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,  	found->bytes_used += block_group->used;  	found->disk_used += block_group->used * factor;  	found->bytes_readonly += block_group->bytes_super; -	btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); +	btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable);  	if (block_group->length > 0)  		found->full = 0;  	btrfs_try_granting_tickets(info, found); @@ -489,9 +498,7 @@ again:  		if ((used + ticket->bytes <= space_info->total_bytes) ||  		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,  					 flush)) { -			btrfs_space_info_update_bytes_may_use(fs_info, -							      space_info, -							      ticket->bytes); +			btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);  			remove_ticket(space_info, ticket);  			ticket->bytes = 0;  			space_info->tickets_id++; @@ -834,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,  		 */  		ret = btrfs_commit_current_transaction(root);  		break; +	case RESET_ZONES: +		ret = btrfs_reset_unused_block_groups(space_info, num_bytes); +		break;  	default:  		ret = -ENOSPC;  		break; @@ -1086,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)  	enum btrfs_flush_state flush_state;  	int commit_cycles = 0;  	u64 last_tickets_id; +	enum btrfs_flush_state final_state;  	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);  	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); +	if (btrfs_is_zoned(fs_info)) +		final_state = RESET_ZONES; +	else +		final_state = COMMIT_TRANS;  	spin_lock(&space_info->lock);  	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1141,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)  		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)  			flush_state++; -		if (flush_state > COMMIT_TRANS) { +		if (flush_state > final_state) {  			commit_cycles++;  			if (commit_cycles > 2) {  				if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1155,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)  			}  		}  		spin_unlock(&space_info->lock); -	} while 
(flush_state <= COMMIT_TRANS); +	} while (flush_state <= final_state);  }  /* @@ -1286,6 +1301,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)   *   This is where we reclaim all of the pinned space generated by running the   *   iputs   * + * RESET_ZONES + *   This state works only for the zoned mode. We scan the unused block group + *   list and reset the zones and reuse the block group. + *   * ALLOC_CHUNK_FORCE   *   For data we start with alloc chunk force, however we could have been full   *   before, and then the transaction commit could have freed new block groups, @@ -1295,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = {  	FLUSH_DELALLOC_FULL,  	RUN_DELAYED_IPUTS,  	COMMIT_TRANS, +	RESET_ZONES,  	ALLOC_CHUNK_FORCE,  }; @@ -1386,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)  static const enum btrfs_flush_state priority_flush_states[] = {  	FLUSH_DELAYED_ITEMS_NR,  	FLUSH_DELAYED_ITEMS, +	RESET_ZONES,  	ALLOC_CHUNK,  }; @@ -1399,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {  	FLUSH_DELALLOC_FULL,  	ALLOC_CHUNK,  	COMMIT_TRANS, +	RESET_ZONES,  };  static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, @@ -1690,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,  	if (!pending_tickets &&  	    ((used + orig_bytes <= space_info->total_bytes) ||  	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { -		btrfs_space_info_update_bytes_may_use(fs_info, space_info, -						      orig_bytes); +		btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);  		ret = 0;  	} @@ -1703,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,  	if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {  		used = btrfs_space_info_used(space_info, false);  		if (used + orig_bytes <= space_info->total_bytes) { -			btrfs_space_info_update_bytes_may_use(fs_info, space_info, -							      orig_bytes); +			btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);  			ret = 0;  		}  	} @@ -2082,3 +2102,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)  			do_reclaim_sweep(space_info, raid);  	}  } + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ +	struct btrfs_fs_info *fs_info = space_info->fs_info; +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + +	lockdep_assert_held(&space_info->lock); + +	/* Prioritize the global reservation to receive the freed space. */ +	if (global_rsv->space_info != space_info) +		goto grant; + +	spin_lock(&global_rsv->lock); +	if (!global_rsv->full) { +		u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + +		global_rsv->reserved += to_add; +		btrfs_space_info_update_bytes_may_use(space_info, to_add); +		if (global_rsv->reserved >= global_rsv->size) +			global_rsv->full = 1; +		len -= to_add; +	} +	spin_unlock(&global_rsv->lock); + +grant: +	/* Add to any tickets we may have. */ +	if (len) +		btrfs_try_granting_tickets(fs_info, space_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index efbecc0c5258..a96efdb5e681 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum {  	BTRFS_RESERVE_FLUSH_EMERGENCY,  }; +/* + * Please be aware that the order of enum values will be the order of the reclaim + * process in btrfs_async_reclaim_metadata_space(). 
+ */  enum btrfs_flush_state {  	FLUSH_DELAYED_ITEMS_NR	= 1,  	FLUSH_DELAYED_ITEMS	= 2, @@ -91,6 +95,7 @@ enum btrfs_flush_state {  	ALLOC_CHUNK_FORCE	= 9,  	RUN_DELAYED_IPUTS	= 10,  	COMMIT_TRANS		= 11, +	RESET_ZONES		= 12,  };  struct btrfs_space_info { @@ -229,10 +234,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i   */  #define DECLARE_SPACE_INFO_UPDATE(name, trace_name)			\  static inline void							\ -btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info,		\ -			       struct btrfs_space_info *sinfo,		\ +btrfs_space_info_update_##name(struct btrfs_space_info *sinfo,		\  			       s64 bytes)				\  {									\ +	struct btrfs_fs_info *fs_info = sinfo->fs_info;			\  	const u64 abs_bytes = (bytes < 0) ? -bytes : bytes;		\  	lockdep_assert_held(&sinfo->lock);				\  	trace_update_##name(fs_info, sinfo, sinfo->name, bytes);	\ @@ -275,13 +280,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,  			 enum btrfs_reserve_flush_enum flush);  static inline void btrfs_space_info_free_bytes_may_use( -				struct btrfs_fs_info *fs_info,  				struct btrfs_space_info *space_info,  				u64 num_bytes)  {  	spin_lock(&space_info->lock); -	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); -	btrfs_try_granting_tickets(fs_info, space_info); +	btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); +	btrfs_try_granting_tickets(space_info->fs_info, space_info);  	spin_unlock(&space_info->lock);  }  int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, @@ -295,5 +299,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool  bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);  int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);  void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);  #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8c68059ac1b0..722acf768396 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -635,6 +635,28 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,  IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,  			 folio_test_checked); +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst)			\ +{									\ +	const int sectors_per_page = fs_info->sectors_per_page;		\ +									\ +	ASSERT(sectors_per_page < BITS_PER_LONG);			\ +	*dst = bitmap_read(subpage->bitmaps,				\ +			   sectors_per_page * btrfs_bitmap_nr_##name,	\ +			   sectors_per_page);				\ +} + +#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len)		\ +{									\ +	const struct btrfs_subpage *subpage = folio_get_private(folio);	\ +	unsigned long bitmap;						\ +									\ +	GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap);		\ +	btrfs_warn(fs_info,						\ +	"dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ +		   start, len, folio_pos(folio),			\ +		   fs_info->sectors_per_page, &bitmap);			\ +} +  /*   * Make sure not only the page dirty bit is cleared, but also subpage dirty bit   * is cleared. 
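Because DECLARE_SPACE_INFO_UPDATE() now derives fs_info from sinfo->fs_info, every caller of the generated helpers loses its first argument. A condensed before/after of the call shape, matching the space-info.c hunks earlier in this diff:

	/* Before: fs_info passed explicitly. */
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, bytes);

	/* After: fs_info is taken from space_info->fs_info inside the macro. */
	btrfs_space_info_update_bytes_may_use(space_info, bytes);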
@@ -660,6 +682,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,  	subpage = folio_get_private(folio);  	ASSERT(subpage);  	spin_lock_irqsave(&subpage->lock, flags); +	if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { +		SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); +		ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); +	}  	ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));  	spin_unlock_irqrestore(&subpage->lock, flags);  } @@ -689,23 +715,16 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,  	nbits = len >> fs_info->sectorsize_bits;  	spin_lock_irqsave(&subpage->lock, flags);  	/* Target range should not yet be locked. */ -	ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); +	if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { +		SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); +		ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); +	}  	bitmap_set(subpage->bitmaps, start_bit, nbits);  	ret = atomic_add_return(nbits, &subpage->nr_locked);  	ASSERT(ret <= fs_info->sectors_per_page);  	spin_unlock_irqrestore(&subpage->lock, flags);  } -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst)			\ -{									\ -	const int sectors_per_page = fs_info->sectors_per_page;		\ -									\ -	ASSERT(sectors_per_page < BITS_PER_LONG);			\ -	*dst = bitmap_read(subpage->bitmaps,				\ -			   sectors_per_page * btrfs_bitmap_nr_##name,	\ -			   sectors_per_page);				\ -} -  void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,  				      struct folio *folio, u64 start, u32 len)  { @@ -716,6 +735,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,  	unsigned long writeback_bitmap;  	unsigned long ordered_bitmap;  	unsigned long checked_bitmap; +	unsigned long locked_bitmap;  	unsigned long flags;  	ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -728,15 +748,16 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,  	GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);  	GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);  	GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); -	GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); +	GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap);  	spin_unlock_irqrestore(&subpage->lock, flags);  	dump_page(folio_page(folio, 0), "btrfs subpage dump");  	btrfs_warn(fs_info, -"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",  		    start, len, folio_pos(folio),  		    sectors_per_page, &uptodate_bitmap,  		    sectors_per_page, &dirty_bitmap, +		    sectors_per_page, &locked_bitmap,  		    sectors_per_page, &writeback_bitmap,  		    sectors_per_page, &ordered_bitmap,  		    sectors_per_page, &checked_bitmap); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 428fa9389fd4..44fff1f4eac4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -137,6 +137,19 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback);  DECLARE_BTRFS_SUBPAGE_OPS(ordered);  DECLARE_BTRFS_SUBPAGE_OPS(checked); +/* + * Helper for error cleanup, where a folio will have its dirty flag cleared, + * with writeback started and finished. 
+ */ +static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info, +					       struct folio *locked_folio, +					       u64 start, u32 len) +{ +	btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); +	btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); +	btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); +} +  bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,  					struct folio *folio, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7dfe5005129a..f809c3200c21 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -971,7 +971,7 @@ static int btrfs_fill_super(struct super_block *sb,  	err = open_ctree(sb, fs_devices);  	if (err) { -		btrfs_err(fs_info, "open_ctree failed"); +		btrfs_err(fs_info, "open_ctree failed: %d", err);  		return err;  	} @@ -2446,6 +2446,9 @@ static __cold void btrfs_interface_exit(void)  static int __init btrfs_print_mod_info(void)  {  	static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL +			", experimental=on" +#endif  #ifdef CONFIG_BTRFS_DEBUG  			", debug=on"  #endif @@ -2466,7 +2469,17 @@ static int __init btrfs_print_mod_info(void)  			", fsverity=no"  #endif  			; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	if (btrfs_get_mod_read_policy() == NULL) +		pr_info("Btrfs loaded%s\n", options); +	else +		pr_info("Btrfs loaded%s, read_policy=%s\n", +			 options, btrfs_get_mod_read_policy()); +#else  	pr_info("Btrfs loaded%s\n", options); +#endif +  	return 0;  } @@ -2524,6 +2537,11 @@ static const struct init_sequence mod_init_seq[] = {  	}, {  		.init_func = extent_map_init,  		.exit_func = extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	}, { +		.init_func = btrfs_read_policy_init, +		.exit_func = NULL, +#endif  	}, {  		.init_func = ordered_data_init,  		.exit_func = ordered_data_exit, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7f09b6c9cc2d..53b846d99ece 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1305,7 +1305,73 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,  }  BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { +	"pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	"round-robin", +	"devid", +#endif +}; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. */ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ +	return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +{ +	char param[32] = { 0 }; +	char __maybe_unused *value_str; + +	if (!str || strlen(str) == 0) +		return 0; + +	strncpy(param, str, sizeof(param) - 1); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	/* Separate value from input in policy:value format. 
*/ +	value_str = strchr(param, ':'); +	if (value_str) { +		int ret; + +		*value_str = 0; +		value_str++; +		if (!value_ret) +			return -EINVAL; +		ret = kstrtos64(value_str, 10, value_ret); +		if (ret) +			return -EINVAL; +		if (*value_ret < 0) +			return -ERANGE; +	} +#endif + +	return sysfs_match_string(btrfs_read_policy_name, param); +} + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ +	s64 value; + +	if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { +		btrfs_err(NULL, "invalid read policy or value %s", read_policy); +		return -EINVAL; +	} + +	return 0; +} +#endif  static ssize_t btrfs_read_policy_show(struct kobject *kobj,  				      struct kobj_attribute *a, char *buf) @@ -1316,14 +1382,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,  	int i;  	for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { -		if (policy == i) -			ret += sysfs_emit_at(buf, ret, "%s[%s]", -					 (ret == 0 ? "" : " "), -					 btrfs_read_policy_name[i]); -		else -			ret += sysfs_emit_at(buf, ret, "%s%s", -					 (ret == 0 ? "" : " "), -					 btrfs_read_policy_name[i]); +		if (ret != 0) +			ret += sysfs_emit_at(buf, ret, " "); + +		if (i == policy) +			ret += sysfs_emit_at(buf, ret, "["); + +		ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +		if (i == BTRFS_READ_POLICY_RR) +			ret += sysfs_emit_at(buf, ret, ":%u", +					     READ_ONCE(fs_devices->rr_min_contig_read)); + +		if (i == BTRFS_READ_POLICY_DEVID) +			ret += sysfs_emit_at(buf, ret, ":%llu", +					     READ_ONCE(fs_devices->read_devid)); +#endif +		if (i == policy) +			ret += sysfs_emit_at(buf, ret, "]");  	}  	ret += sysfs_emit_at(buf, ret, "\n"); @@ -1336,21 +1413,80 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,  				       const char *buf, size_t len)  {  	struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); -	int i; +	int index; +	s64 value = -1; -	for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { -		if (sysfs_streq(buf, btrfs_read_policy_name[i])) { -			if (i != READ_ONCE(fs_devices->read_policy)) { -				WRITE_ONCE(fs_devices->read_policy, i); -				btrfs_info(fs_devices->fs_info, -					   "read policy set to '%s'", -					   btrfs_read_policy_name[i]); +	index = btrfs_read_policy_to_enum(buf, &value); +	if (index < 0) +		return -EINVAL; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	/* If moving from RR then disable collecting fs stats. 
*/ +	if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) +		fs_devices->collect_fs_stats = false; + +	if (index == BTRFS_READ_POLICY_RR) { +		if (value != -1) { +			const u32 sectorsize = fs_devices->fs_info->sectorsize; + +			if (!IS_ALIGNED(value, sectorsize)) { +				u64 temp_value = round_up(value, sectorsize); + +				btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", +					  value, sectorsize, temp_value); +				value = temp_value;  			} -			return len; +		} else { +			value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; +		} + +		if (index != READ_ONCE(fs_devices->read_policy) || +		    value != READ_ONCE(fs_devices->rr_min_contig_read)) { +			WRITE_ONCE(fs_devices->read_policy, index); +			WRITE_ONCE(fs_devices->rr_min_contig_read, value); + +			btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", +				   btrfs_read_policy_name[index], value); +		} + +		fs_devices->collect_fs_stats = true; + +		return len; +	} + +	if (index == BTRFS_READ_POLICY_DEVID) { +		if (value != -1) { +			BTRFS_DEV_LOOKUP_ARGS(args); + +			/* Validate input devid. */ +			args.devid = value; +			if (btrfs_find_device(fs_devices, &args) == NULL) +				return -EINVAL; +		} else { +			/* Set default devid to the devid of the latest device. */ +			value = fs_devices->latest_dev->devid;  		} + +		if (index != READ_ONCE(fs_devices->read_policy) || +		    value != READ_ONCE(fs_devices->read_devid)) { +			WRITE_ONCE(fs_devices->read_policy, index); +			WRITE_ONCE(fs_devices->read_devid, value); + +			btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", +				   btrfs_read_policy_name[index], value); +		} + +		return len; +	} +#endif +	if (index != READ_ONCE(fs_devices->read_policy)) { +		WRITE_ONCE(fs_devices->read_policy, index); +		btrfs_info(fs_devices->fs_info, "read policy set to '%s'", +			   btrfs_read_policy_name[index]);  	} -	return -EINVAL; +	return len;  }  BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e6a284c59809..3fc5c6f90dc4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -47,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);  int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);  void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,  				struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif  #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index e607b5d52fb1..5eff8d7d2360 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -30,6 +30,7 @@ const char *test_error[] = {  	[TEST_ALLOC_EXTENT_MAP]      = "cannot allocate extent map",  	[TEST_ALLOC_CHUNK_MAP]       = "cannot allocate chunk map",  	[TEST_ALLOC_IO_CONTEXT]	     = "cannot allocate io context", +	[TEST_ALLOC_TRANSACTION]     = "cannot allocate transaction",  };  static const struct super_operations btrfs_test_super_ops = { @@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)  	fs_info->nodesize = nodesize;  	fs_info->sectorsize = sectorsize;  	fs_info->sectorsize_bits = ilog2(sectorsize); + +	/* CRC32C csum size. 
*/ +	fs_info->csum_size = 4; +	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / +		fs_info->csum_size;  	set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);  	test_mnt->mnt_sb->s_fs_info = fs_info; @@ -247,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)  	kfree(cache);  } +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) +{ +	memset(trans, 0, sizeof(*trans)); +	trans->fs_info = fs_info; +	xa_init(&trans->delayed_refs.head_refs); +	xa_init(&trans->delayed_refs.dirty_extents); +	spin_lock_init(&trans->delayed_refs.lock); +} +  void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,  			    struct btrfs_fs_info *fs_info)  { @@ -295,6 +310,9 @@ int btrfs_run_sanity_tests(void)  			ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);  			if (ret)  				goto out; +			ret = btrfs_test_delayed_refs(sectorsize, nodesize); +			if (ret) +				goto out;  		}  	}  	ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b524ecf2f452..4307bdaa6749 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -6,6 +6,8 @@  #ifndef BTRFS_TESTS_H  #define BTRFS_TESTS_H +#include <linux/types.h> +  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  int btrfs_run_sanity_tests(void); @@ -25,12 +27,14 @@ enum {  	TEST_ALLOC_EXTENT_MAP,  	TEST_ALLOC_CHUNK_MAP,  	TEST_ALLOC_IO_CONTEXT, +	TEST_ALLOC_TRANSACTION,  };  extern const char *test_error[];  struct btrfs_root;  struct btrfs_trans_handle; +struct btrfs_transaction;  int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);  int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); @@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);  int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);  int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);  int btrfs_test_extent_map(void); +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize);  struct inode *btrfs_new_test_inode(void);  struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);  void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); @@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt  void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);  void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,  			    struct btrfs_fs_info *fs_info); +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info);  struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);  #else  static inline int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c new file mode 100644 index 000000000000..6558508c2ddf --- /dev/null +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -0,0 +1,1015 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> +#include "btrfs-tests.h" +#include "../transaction.h" +#include "../delayed-ref.h" +#include "../extent-tree.h" + +#define FAKE_ROOT_OBJECTID 256 +#define FAKE_BYTENR 0 +#define FAKE_LEVEL 1 +#define FAKE_INO 256 +#define FAKE_FILE_OFFSET 0 +#define FAKE_PARENT SZ_1M + +struct ref_head_check { +	u64 bytenr; +	u64 num_bytes; +	int ref_mod; +	int total_ref_mod; +	int must_insert; +}; + +struct ref_node_check { +	u64 bytenr; +	u64 num_bytes; +	int ref_mod; +	enum btrfs_delayed_ref_action action; +	u8 type; +	u64 parent; +	u64 root; +	u64 owner; +	u64 offset; +}; 
+ +static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type) +{ +	if ((type == BTRFS_TREE_BLOCK_REF_KEY) || +	    (type == BTRFS_SHARED_BLOCK_REF_KEY)) +		return BTRFS_REF_METADATA; +	return BTRFS_REF_DATA; +} + +static void delete_delayed_ref_head(struct btrfs_trans_handle *trans, +				    struct btrfs_delayed_ref_head *head) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_delayed_ref_root *delayed_refs = +		&trans->transaction->delayed_refs; + +	spin_lock(&delayed_refs->lock); +	spin_lock(&head->lock); +	btrfs_delete_ref_head(fs_info, delayed_refs, head); +	spin_unlock(&head->lock); +	spin_unlock(&delayed_refs->lock); + +	btrfs_delayed_ref_unlock(head); +	btrfs_put_delayed_ref_head(head); +} + +static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head, +				    struct btrfs_delayed_ref_node *node) +{ +	rb_erase_cached(&node->ref_node, &head->ref_tree); +	RB_CLEAR_NODE(&node->ref_node); +	if (!list_empty(&node->add_list)) +		list_del_init(&node->add_list); +	btrfs_put_delayed_ref(node); +} + +static int validate_ref_head(struct btrfs_delayed_ref_head *head, +			     struct ref_head_check *check) +{ +	if (head->bytenr != check->bytenr) { +		test_err("invalid bytenr have: %llu want: %llu", head->bytenr, +			 check->bytenr); +		return -EINVAL; +	} + +	if (head->num_bytes != check->num_bytes) { +		test_err("invalid num_bytes have: %llu want: %llu", +			 head->num_bytes, check->num_bytes); +		return -EINVAL; +	} + +	if (head->ref_mod != check->ref_mod) { +		test_err("invalid ref_mod have: %d want: %d", head->ref_mod, +			 check->ref_mod); +		return -EINVAL; +	} + +	if (head->total_ref_mod != check->total_ref_mod) { +		test_err("invalid total_ref_mod have: %d want: %d", +			 head->total_ref_mod, check->total_ref_mod); +		return -EINVAL; +	} + +	if (head->must_insert_reserved != check->must_insert) { +		test_err("invalid must_insert have: %d want: %d", +			 head->must_insert_reserved, check->must_insert); +		return -EINVAL; +	} + +	return 0; +} + +static int validate_ref_node(struct btrfs_delayed_ref_node *node, +			     struct ref_node_check *check) +{ +	if (node->bytenr != check->bytenr) { +		test_err("invalid bytenr have: %llu want: %llu", node->bytenr, +			 check->bytenr); +		return -EINVAL; +	} + +	if (node->num_bytes != check->num_bytes) { +		test_err("invalid num_bytes have: %llu want: %llu", +			 node->num_bytes, check->num_bytes); +		return -EINVAL; +	} + +	if (node->ref_mod != check->ref_mod) { +		test_err("invalid ref_mod have: %d want: %d", node->ref_mod, +			 check->ref_mod); +		return -EINVAL; +	} + +	if (node->action != check->action) { +		test_err("invalid action have: %d want: %d", node->action, +			 check->action); +		return -EINVAL; +	} + +	if (node->parent != check->parent) { +		test_err("invalid parent have: %llu want: %llu", node->parent, +			 check->parent); +		return -EINVAL; +	} + +	if (node->ref_root != check->root) { +		test_err("invalid root have: %llu want: %llu", node->ref_root, +			 check->root); +		return -EINVAL; +	} + +	if (node->type != check->type) { +		test_err("invalid type have: %d want: %d", node->type, +			 check->type); +		return -EINVAL; +	} + +	if (btrfs_delayed_ref_owner(node) != check->owner) { +		test_err("invalid owner have: %llu want: %llu", +			 btrfs_delayed_ref_owner(node), check->owner); +		return -EINVAL; +	} + +	if (btrfs_delayed_ref_offset(node) != check->offset) { +		test_err("invalid offset have: %llu want: %llu", +			 btrfs_delayed_ref_offset(node), check->offset); +		return -EINVAL; +	} + +	return 
0; +} + +static int simple_test(struct btrfs_trans_handle *trans, +		       struct ref_head_check *head_check, +		       struct ref_node_check *node_check) +{ +	struct btrfs_delayed_ref_root *delayed_refs = +		&trans->transaction->delayed_refs; +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_delayed_ref_head *head; +	struct btrfs_delayed_ref_node *node; +	struct btrfs_ref ref = { +		.type = ref_type_from_disk_ref_type(node_check->type), +		.action = node_check->action, +		.parent = node_check->parent, +		.ref_root = node_check->root, +		.bytenr = node_check->bytenr, +		.num_bytes = fs_info->nodesize, +	}; +	int ret; + +	if (ref.type == BTRFS_REF_METADATA) +		btrfs_init_tree_ref(&ref, node_check->owner, node_check->root, +				    false); +	else +		btrfs_init_data_ref(&ref, node_check->owner, node_check->offset, +				    node_check->root, true); + +	if (ref.type == BTRFS_REF_METADATA) +		ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	else +		ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +	if (ret) { +		test_err("failed ref action %d", ret); +		return ret; +	} + +	head = btrfs_select_ref_head(fs_info, delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		return -EINVAL; +	} + +	ret = -EINVAL; +	if (validate_ref_head(head, head_check)) +		goto out; + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	if (validate_ref_node(node, node_check)) +		goto out; +	ret = 0; +out: +	btrfs_unselect_ref_head(delayed_refs, head); +	btrfs_destroy_delayed_refs(trans->transaction); +	return ret; +} + +/* + * These are simple tests, make sure that our btrfs_ref's get turned into the + * appropriate btrfs_delayed_ref_node based on their settings and action. 
+ */ +static int simple_tests(struct btrfs_trans_handle *trans) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct ref_head_check head_check = { +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +		.ref_mod = 1, +		.total_ref_mod = 1, +	}; +	struct ref_node_check node_check = { +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +		.ref_mod = 1, +		.action = BTRFS_ADD_DELAYED_REF, +		.type = BTRFS_TREE_BLOCK_REF_KEY, +		.parent = 0, +		.root = FAKE_ROOT_OBJECTID, +		.owner = FAKE_LEVEL, +		.offset = 0, +	}; + +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single add tree block failed"); +		return -EINVAL; +	} + +	node_check.type = BTRFS_EXTENT_DATA_REF_KEY; +	node_check.owner = FAKE_INO; +	node_check.offset = FAKE_FILE_OFFSET; + +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single add extent data failed"); +		return -EINVAL; +	} + +	node_check.parent = FAKE_PARENT; +	node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; +	node_check.owner = FAKE_LEVEL; +	node_check.offset = 0; + +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single add shared block failed"); +		return -EINVAL; +	} + +	node_check.type = BTRFS_SHARED_DATA_REF_KEY; +	node_check.owner = FAKE_INO; +	node_check.offset = FAKE_FILE_OFFSET; + +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single add shared data failed"); +		return -EINVAL; +	} + +	head_check.ref_mod = -1; +	head_check.total_ref_mod = -1; +	node_check.action = BTRFS_DROP_DELAYED_REF; +	node_check.type = BTRFS_TREE_BLOCK_REF_KEY; +	node_check.owner = FAKE_LEVEL; +	node_check.offset = 0; +	node_check.parent = 0; + +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single drop tree block failed"); +		return -EINVAL; +	} + +	node_check.type = BTRFS_EXTENT_DATA_REF_KEY; +	node_check.owner = FAKE_INO; +	node_check.offset = FAKE_FILE_OFFSET; + +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single drop extent data failed"); +		return -EINVAL; +	} + +	node_check.parent = FAKE_PARENT; +	node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; +	node_check.owner = FAKE_LEVEL; +	node_check.offset = 0; +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single drop shared block failed"); +		return -EINVAL; +	} + +	node_check.type = BTRFS_SHARED_DATA_REF_KEY; +	node_check.owner = FAKE_INO; +	node_check.offset = FAKE_FILE_OFFSET; +	if (simple_test(trans, &head_check, &node_check)) { +		test_err("single drop shared data failed"); +		return -EINVAL; +	} + +	return 0; +} + +/* + * Merge tests, validate that we do delayed ref merging properly, the ref counts + * all end up properly, and delayed refs are deleted once they're no longer + * needed. 
+ */ +static int merge_tests(struct btrfs_trans_handle *trans, +		       enum btrfs_ref_type type) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_delayed_ref_head *head = NULL; +	struct btrfs_delayed_ref_node *node; +	struct btrfs_ref ref = { +		.type = type, +		.action = BTRFS_ADD_DELAYED_REF, +		.parent = 0, +		.ref_root = FAKE_ROOT_OBJECTID, +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +	}; +	struct ref_head_check head_check = { +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +		.ref_mod = 0, +		.total_ref_mod = 0, +	}; +	struct ref_node_check node_check = { +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +		.ref_mod = 2, +		.action = BTRFS_ADD_DELAYED_REF, +		.parent = 0, +		.root = FAKE_ROOT_OBJECTID, +	}; +	int ret; + +	/* +	 * First add a ref and then drop it, make sure we get a head ref with a +	 * 0 total ref mod and no nodes. +	 */ +	if (type == BTRFS_REF_METADATA) { +		node_check.type = BTRFS_TREE_BLOCK_REF_KEY; +		node_check.owner = FAKE_LEVEL; +		btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); +	} else { +		node_check.type = BTRFS_EXTENT_DATA_REF_KEY; +		node_check.owner = FAKE_INO; +		node_check.offset = FAKE_FILE_OFFSET; +		btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET, +				    FAKE_ROOT_OBJECTID, true); +	} + +	if (type == BTRFS_REF_METADATA) +		ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	else +		ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +	if (ret) { +		test_err("failed ref action %d", ret); +		return ret; +	} + +	ref.action = BTRFS_DROP_DELAYED_REF; +	if (type == BTRFS_REF_METADATA) +		ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	else +		ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		goto out; +	} + +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("single add and drop failed"); +		goto out; +	} + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (node) { +		test_err("found node when none should exist"); +		goto out; +	} + +	delete_delayed_ref_head(trans, head); +	head = NULL; + +	/* +	 * Add a ref, then add another ref, make sure we get a head ref with a +	 * 2 total ref mod and 1 node. 
+	 */ +	ref.action = BTRFS_ADD_DELAYED_REF; +	if (type == BTRFS_REF_METADATA) +		ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	else +		ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	if (type == BTRFS_REF_METADATA) +		ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	else +		ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		goto out; +	} + +	head_check.ref_mod = 2; +	head_check.total_ref_mod = 2; +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("double add failed"); +		goto out; +	} + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); +		goto out; +	} + +	delete_delayed_ref_node(head, node); + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (node) { +		test_err("found node when none should exist"); +		goto out; +	} +	delete_delayed_ref_head(trans, head); +	head = NULL; + +	/* Add two drop refs, make sure they are merged properly. */ +	ref.action = BTRFS_DROP_DELAYED_REF; +	if (type == BTRFS_REF_METADATA) +		ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	else +		ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	if (type == BTRFS_REF_METADATA) +		ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	else +		ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		goto out; +	} + +	head_check.ref_mod = -2; +	head_check.total_ref_mod = -2; +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("double drop failed"); +		goto out; +	} + +	node_check.action = BTRFS_DROP_DELAYED_REF; +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); +		goto out; +	} + +	delete_delayed_ref_node(head, node); + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (node) { +		test_err("found node when none should exist"); +		goto out; +	} +	delete_delayed_ref_head(trans, head); +	head = NULL; + +	/* Add multiple refs, then drop until we go negative again. 
*/ +	ref.action = BTRFS_ADD_DELAYED_REF; +	for (int i = 0; i < 10; i++) { +		if (type == BTRFS_REF_METADATA) +			ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +		else +			ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +		if (ret) { +			test_err("failed ref action %d", ret); +			goto out; +		} +	} + +	ref.action = BTRFS_DROP_DELAYED_REF; +	for (int i = 0; i < 12; i++) { +		if (type == BTRFS_REF_METADATA) +			ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +		else +			ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +		if (ret) { +			test_err("failed ref action %d", ret); +			goto out; +		} +	} + +	head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		ret = -EINVAL; +		goto out; +	} + +	head_check.ref_mod = -2; +	head_check.total_ref_mod = -2; +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("double drop failed"); +		goto out; +	} + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); +		goto out; +	} + +	delete_delayed_ref_node(head, node); + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (node) { +		test_err("found node when none should exist"); +		goto out; +	} + +	delete_delayed_ref_head(trans, head); +	head = NULL; + +	/* Drop multiple refs, then add until we go positive again. */ +	ref.action = BTRFS_DROP_DELAYED_REF; +	for (int i = 0; i < 10; i++) { +		if (type == BTRFS_REF_METADATA) +			ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +		else +			ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +		if (ret) { +			test_err("failed ref action %d", ret); +			goto out; +		} +	} + +	ref.action = BTRFS_ADD_DELAYED_REF; +	for (int i = 0; i < 12; i++) { +		if (type == BTRFS_REF_METADATA) +			ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +		else +			ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +		if (ret) { +			test_err("failed ref action %d", ret); +			goto out; +		} +	} + +	head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		ret = -EINVAL; +		goto out; +	} + +	head_check.ref_mod = 2; +	head_check.total_ref_mod = 2; +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("add and drop to positive failed"); +		goto out; +	} + +	node_check.action = BTRFS_ADD_DELAYED_REF; +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); +		goto out; +	} + +	delete_delayed_ref_node(head, node); + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (node) { +		test_err("found node when none should exist"); +		goto out; +	} +	delete_delayed_ref_head(trans, head); +	head = NULL; + +	/* +	 * Add a bunch of refs with different roots and parents, then drop them +	 * all, make sure everything is properly merged. 
+	 */ +	ref.action = BTRFS_ADD_DELAYED_REF; +	for (int i = 0; i < 50; i++) { +		if (!(i % 2)) { +			ref.parent = 0; +			ref.ref_root = FAKE_ROOT_OBJECTID + i; +		} else { +			ref.parent = FAKE_PARENT + (i * fs_info->nodesize); +		} +		if (type == BTRFS_REF_METADATA) +			ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +		else +			ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +		if (ret) { +			test_err("failed ref action %d", ret); +			goto out; +		} +	} + +	ref.action = BTRFS_DROP_DELAYED_REF; +	for (int i = 0; i < 50; i++) { +		if (!(i % 2)) { +			ref.parent = 0; +			ref.ref_root = FAKE_ROOT_OBJECTID + i; +		} else { +			ref.parent = FAKE_PARENT + (i * fs_info->nodesize); +		} +		if (type == BTRFS_REF_METADATA) +			ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +		else +			ret = btrfs_add_delayed_data_ref(trans, &ref, 0); +		if (ret) { +			test_err("failed ref action %d", ret); +			goto out; +		} +	} + +	head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		ret = -EINVAL; +		goto out; +	} + +	head_check.ref_mod = 0; +	head_check.total_ref_mod = 0; +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("add and drop multiple failed"); +		goto out; +	} + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (node) { +		test_err("found node when none should exist"); +		goto out; +	} +	ret = 0; +out: +	if (!IS_ERR_OR_NULL(head)) +		btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head); +	btrfs_destroy_delayed_refs(trans->transaction); +	return ret; +} + +/* + * Basic test to validate we always get the add operations first followed by any + * delete operations. + */ +static int select_delayed_refs_test(struct btrfs_trans_handle *trans) +{ +	struct btrfs_delayed_ref_root *delayed_refs = +		&trans->transaction->delayed_refs; +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_delayed_ref_head *head = NULL; +	struct btrfs_delayed_ref_node *node; +	struct btrfs_ref ref = { +		.type = BTRFS_REF_METADATA, +		.action = BTRFS_DROP_DELAYED_REF, +		.parent = 0, +		.ref_root = FAKE_ROOT_OBJECTID, +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +	}; +	struct ref_head_check head_check = { +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +		.ref_mod = 0, +		.total_ref_mod = 0, +	}; +	struct ref_node_check node_check = { +		.bytenr = FAKE_BYTENR, +		.num_bytes = fs_info->nodesize, +		.ref_mod = 1, +		.action = BTRFS_ADD_DELAYED_REF, +		.type = BTRFS_TREE_BLOCK_REF_KEY, +		.parent = 0, +		.owner = FAKE_LEVEL, +		.offset = 0, +	}; +	int ret; + +	/* Add the drop first. */ +	btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); +	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	if (ret) { +		test_err("failed ref action %d", ret); +		return ret; +	} + +	/* +	 * Now add the add, and make it a different root so it's logically later +	 * in the rb tree. 
+	 */ +	ref.action = BTRFS_ADD_DELAYED_REF; +	ref.ref_root = FAKE_ROOT_OBJECTID + 1; +	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	head = btrfs_select_ref_head(fs_info, delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		ret = -EINVAL; +		head = NULL; +		goto out; +	} + +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("head check failed"); +		goto out; +	} + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	node_check.root = FAKE_ROOT_OBJECTID + 1; +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); +		goto out; +	} +	delete_delayed_ref_node(head, node); + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	node_check.action = BTRFS_DROP_DELAYED_REF; +	node_check.root = FAKE_ROOT_OBJECTID; +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); +		goto out; +	} +	delete_delayed_ref_node(head, node); +	delete_delayed_ref_head(trans, head); +	head = NULL; + +	/* +	 * Now we're going to do the same thing, but we're going to have an add +	 * that gets deleted because of a merge, and make sure we still have +	 * another add in place. +	 */ +	ref.action = BTRFS_DROP_DELAYED_REF; +	ref.ref_root = FAKE_ROOT_OBJECTID; +	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	ref.action = BTRFS_ADD_DELAYED_REF; +	ref.ref_root = FAKE_ROOT_OBJECTID + 1; +	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	ref.action = BTRFS_DROP_DELAYED_REF; +	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	ref.action = BTRFS_ADD_DELAYED_REF; +	ref.ref_root = FAKE_ROOT_OBJECTID + 2; +	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); +	if (ret) { +		test_err("failed ref action %d", ret); +		goto out; +	} + +	head = btrfs_select_ref_head(fs_info, delayed_refs); +	if (IS_ERR_OR_NULL(head)) { +		if (IS_ERR(head)) +			test_err("failed to select delayed ref head: %ld", +				 PTR_ERR(head)); +		else +			test_err("failed to find delayed ref head"); +		ret = -EINVAL; +		head = NULL; +		goto out; +	} + +	ret = -EINVAL; +	if (validate_ref_head(head, &head_check)) { +		test_err("head check failed"); +		goto out; +	} + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	node_check.action = BTRFS_ADD_DELAYED_REF; +	node_check.root = FAKE_ROOT_OBJECTID + 2; +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); +		goto out; +	} +	delete_delayed_ref_node(head, node); + +	spin_lock(&head->lock); +	node = btrfs_select_delayed_ref(head); +	spin_unlock(&head->lock); +	if (!node) { +		test_err("failed to select delayed ref"); +		goto out; +	} + +	node_check.action = BTRFS_DROP_DELAYED_REF; +	node_check.root = FAKE_ROOT_OBJECTID; +	if (validate_ref_node(node, &node_check)) { +		test_err("node check failed"); 
+		goto out; +	} +	delete_delayed_ref_node(head, node); +	ret = 0; +out: +	if (head) +		btrfs_unselect_ref_head(delayed_refs, head); +	btrfs_destroy_delayed_refs(trans->transaction); +	return ret; +} + +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) +{ +	struct btrfs_transaction *transaction; +	struct btrfs_trans_handle trans; +	struct btrfs_fs_info *fs_info; +	int ret; + +	test_msg("running delayed refs tests"); + +	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); +	if (!fs_info) { +		test_std_err(TEST_ALLOC_FS_INFO); +		return -ENOMEM; +	} +	transaction = kmalloc(sizeof(*transaction), GFP_KERNEL); +	if (!transaction) { +		test_std_err(TEST_ALLOC_TRANSACTION); +		ret = -ENOMEM; +		goto out_free_fs_info; +	} +	btrfs_init_dummy_trans(&trans, fs_info); +	btrfs_init_dummy_transaction(transaction, fs_info); +	trans.transaction = transaction; + +	ret = simple_tests(&trans); +	if (!ret) { +		test_msg("running delayed refs merge tests on metadata refs"); +		ret = merge_tests(&trans, BTRFS_REF_METADATA); +	} + +	if (!ret) { +		test_msg("running delayed refs merge tests on data refs"); +		ret = merge_tests(&trans, BTRFS_REF_DATA); +	} + +	if (!ret) +		ret = select_delayed_refs_test(&trans); + +out_free_fs_info: +	btrfs_free_dummy_fs_info(fs_info); +	return ret; +} diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 30f17eb7b6a8..a7bc58a5c1e2 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -14,6 +14,8 @@  #define RST_TEST_NUM_DEVICES	(2)  #define RST_TEST_RAID1_TYPE	(BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) +#define SZ_48K (SZ_32K + SZ_16K) +  typedef int (*test_func_t)(struct btrfs_trans_handle *trans);  static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, @@ -30,6 +32,613 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de  }  /* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_io_context *bioc; +	struct btrfs_io_stripe io_stripe = { 0 }; +	u64 map_type = RST_TEST_RAID1_TYPE; +	u64 logical1 = SZ_1M; +	u64 len1 = SZ_1M; +	u64 logical2 = logical1 + len1; +	u64 len2 = SZ_1M; +	u64 logical3 = logical2 + len2; +	u64 len3 = SZ_1M; +	u64 hole_start = logical1 + SZ_256K; +	u64 hole_len = SZ_2M; +	int ret; + +	bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); +	if (!bioc) { +		test_std_err(TEST_ALLOC_IO_CONTEXT); +		ret = -ENOMEM; +		goto out; +	} + +	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + +	/* Prepare for the test, 1st create 3 x 1M extents. 
*/ +	bioc->map_type = map_type; +	bioc->size = len1; + +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical1 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	bioc->logical = logical2; +	bioc->size = len2; +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical2 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	bioc->logical = logical3; +	bioc->size = len3; +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical3 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	/* +	 * Delete a range starting at logical1 + 256K and 2M in length. Extent +	 * 1 is truncated to 256k length, extent 2 is completely dropped and +	 * extent 3 is moved 256K to the right. +	 */ +	ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); +	if (ret) { +		test_err("deleting RAID extent [%llu, %llu] failed", +			 hole_start, hole_start + hole_len); +		goto out; +	} + +	/* Get the first extent and check its size. */ +	ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, +					   0, &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", +			 logical1, logical1 + len1); +		goto out; +	} + +	if (io_stripe.physical != logical1) { +		test_err("invalid physical address, expected %llu, got %llu", +			 logical1, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len1 != SZ_256K) { +		test_err("invalid stripe length, expected %llu, got %llu", +			 (u64)SZ_256K, len1); +		ret = -EINVAL; +		goto out; +	} + +	/* Get the second extent and check it's absent. */ +	ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, +					   0, &io_stripe); +	if (ret != -ENODATA) { +		test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", +			 logical2, logical2 + len2); +		ret = -EINVAL; +		goto out; +	} + +	/* Get the third extent and check its size. 
*/ +	logical3 += SZ_256K; +	ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, +					   0, &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", +			 logical3, logical3 + len3); +		goto out; +	} + +	if (io_stripe.physical != logical3) { +		test_err("invalid physical address, expected %llu, got %llu", +			 logical3 + SZ_256K, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len3 != SZ_1M - SZ_256K) { +		test_err("invalid stripe length, expected %llu, got %llu", +			 (u64)SZ_1M - SZ_256K, len3); +		ret = -EINVAL; +		goto out; +	} + +	ret = btrfs_delete_raid_extent(trans, logical1, len1); +	if (ret) { +		test_err("deleting RAID extent [%llu, %llu] failed", +			 logical1, logical1 + len1); +		goto out; +	} + +	ret = btrfs_delete_raid_extent(trans, logical3, len3); +	if (ret) { +		test_err("deleting RAID extent [%llu, %llu] failed", +			 logical1, logical1 + len1); +		goto out; +	} + +out: +	btrfs_put_bioc(bioc); +	return ret; +} + +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_io_context *bioc; +	struct btrfs_io_stripe io_stripe = { 0 }; +	u64 map_type = RST_TEST_RAID1_TYPE; +	u64 logical1 = SZ_1M; +	u64 len1 = SZ_1M; +	u64 logical2 = logical1 + len1; +	u64 len2 = SZ_1M; +	u64 logical3 = logical2 + len2; +	u64 len3 = SZ_1M; +	int ret; + +	bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); +	if (!bioc) { +		test_std_err(TEST_ALLOC_IO_CONTEXT); +		ret = -ENOMEM; +		goto out; +	} + +	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + +	/* Prepare for the test, 1st create 3 x 1M extents. */ +	bioc->map_type = map_type; +	bioc->size = len1; + +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical1 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	bioc->logical = logical2; +	bioc->size = len2; +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical2 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	bioc->logical = logical3; +	bioc->size = len3; +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical3 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	/* +	 * Delete a range starting at logical1 and 2M in length. Extents 1 +	 * and 2 are dropped and extent 3 is kept as is. 
+	 */ +	ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); +	if (ret) { +		test_err("deleting RAID extent [%llu, %llu] failed", +			 logical1, logical1 + len1 + len2); +		goto out; +	} + +	ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, +					   0, &io_stripe); +	if (ret != -ENODATA) { +		test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", +			 logical1, len1); +		goto out; +	} + +	ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, +					   0, &io_stripe); +	if (ret != -ENODATA) { +		test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", +			 logical2, len2); +		goto out; +	} + +	ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, +					   0, &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", +			 logical3, len3); +		goto out; +	} + +	if (io_stripe.physical != logical3) { +		test_err("invalid physical address, expected %llu, got %llu", +			 logical3, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len3 != SZ_1M) { +		test_err("invalid stripe length, expected %llu, got %llu", +			 (u64)SZ_1M, len3); +		ret = -EINVAL; +		goto out; +	} + +	ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: +	btrfs_put_bioc(bioc); +	return ret; +} + +/* Test punching a hole into a single RAID stripe-extent. */ +static int test_punch_hole(struct btrfs_trans_handle *trans) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_io_context *bioc; +	struct btrfs_io_stripe io_stripe = { 0 }; +	u64 map_type = RST_TEST_RAID1_TYPE; +	u64 logical1 = SZ_1M; +	u64 hole_start = logical1 + SZ_32K; +	u64 hole_len = SZ_64K; +	u64 logical2 = hole_start + hole_len; +	u64 len = SZ_1M; +	u64 len1 = SZ_32K; +	u64 len2 = len - len1 - hole_len; +	int ret; + +	bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); +	if (!bioc) { +		test_std_err(TEST_ALLOC_IO_CONTEXT); +		ret = -ENOMEM; +		goto out; +	} + +	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); +	bioc->map_type = map_type; +	bioc->size = len; + +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical1 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, +					   &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", logical1, +			 logical1 + len); +		goto out; +	} + +	if (io_stripe.physical != logical1) { +		test_err("invalid physical address, expected %llu got %llu", +			 logical1, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len != SZ_1M) { +		test_err("invalid stripe length, expected %llu got %llu", +			 (u64)SZ_1M, len); +		ret = -EINVAL; +		goto out; +	} + +	ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); +	if (ret) { +		test_err("deleting RAID extent [%llu, %llu] failed", +			 hole_start, hole_start + hole_len); +		goto out; +	} + +	ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, +					   0, &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", +			 logical1, logical1 + len1); +		goto out; +	} + +	if (io_stripe.physical != 
logical1) { +		test_err("invalid physical address, expected %llu, got %llu", +			 logical1, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len1 != SZ_32K) { +		test_err("invalid stripe length, expected %llu, got %llu", +			 (u64)SZ_32K, len1); +		ret = -EINVAL; +		goto out; +	} + +	ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, +					   0, &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", logical2, +			 logical2 + len2); +		goto out; +	} + +	if (io_stripe.physical != logical2) { +		test_err("invalid physical address, expected %llu, got %llu", +			 logical2, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len2 != len - len1 - hole_len) { +		test_err("invalid length, expected %llu, got %llu", +			 len - len1 - hole_len, len2); +		ret = -EINVAL; +		goto out; +	} + +	/* Check for the absence of the hole. */ +	ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len, +					   map_type, 0, &io_stripe); +	if (ret != -ENODATA) { +		ret = -EINVAL; +		test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", +			 hole_start, hole_start + SZ_64K); +		goto out; +	} + +	ret = btrfs_delete_raid_extent(trans, logical1, len1); +	if (ret) +		goto out; + +	ret = btrfs_delete_raid_extent(trans, logical2, len2); +out: +	btrfs_put_bioc(bioc); +	return ret; +} + +/* + * Test a 1M RST write that spans two adjacent RST items on disk and then + * delete a portion starting in the first item and spanning into the second + * item. This is similar to test_front_delete(), but spanning multiple items. + */ +static int test_front_delete_prev_item(struct btrfs_trans_handle *trans) +{ +	struct btrfs_fs_info *fs_info = trans->fs_info; +	struct btrfs_io_context *bioc; +	struct btrfs_io_stripe io_stripe = { 0 }; +	u64 map_type = RST_TEST_RAID1_TYPE; +	u64 logical1 = SZ_1M; +	u64 logical2 = SZ_2M; +	u64 len = SZ_1M; +	int ret; + +	bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); +	if (!bioc) { +		test_std_err(TEST_ALLOC_IO_CONTEXT); +		ret = -ENOMEM; +		goto out; +	} + +	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); +	bioc->map_type = map_type; +	bioc->size = len; + +	/* Insert RAID extent 1. */ +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical1 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	bioc->logical = logical2; +	/* Insert RAID extent 2, directly adjacent to it. */ +	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { +		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + +		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); +		if (!stripe->dev) { +			test_err("cannot find device with devid %d", i); +			ret = -EINVAL; +			goto out; +		} + +		stripe->physical = logical2 + i * SZ_1G; +	} + +	ret = btrfs_insert_one_raid_extent(trans, bioc); +	if (ret) { +		test_err("inserting RAID extent failed: %d", ret); +		goto out; +	} + +	ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M); +	if (ret) { +		test_err("deleting RAID extent [%llu, %llu] failed", +			 logical1 + SZ_512K, (u64)SZ_1M); +		goto out; +	} + +	/* Verify item 1 is truncated to 512K. 
*/ +	ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, +					   &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", logical1, +			 logical1 + len); +		goto out; +	} + +	if (io_stripe.physical != logical1) { +		test_err("invalid physical address, expected %llu got %llu", +			 logical1, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len != SZ_512K) { +		test_err("invalid stripe length, expected %llu got %llu", +			 (u64)SZ_512K, len); +		ret = -EINVAL; +		goto out; +	} + +	/* Verify item 2's start is moved by 512K. */ +	ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len, +					   map_type, 0, &io_stripe); +	if (ret) { +		test_err("lookup of RAID extent [%llu, %llu] failed", +			 logical2 + SZ_512K, logical2 + len); +		goto out; +	} + +	if (io_stripe.physical != logical2 + SZ_512K) { +		test_err("invalid physical address, expected %llu got %llu", +			 logical2 + SZ_512K, io_stripe.physical); +		ret = -EINVAL; +		goto out; +	} + +	if (len != SZ_512K) { +		test_err("invalid stripe length, expected %llu got %llu", +			 (u64)SZ_512K, len); +		ret = -EINVAL; +		goto out; +	} + +	/* Verify there's a hole at [1M+512K, 2M+512K] . */ +	len = SZ_1M; +	ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len, +					   map_type, 0, &io_stripe); +	if (ret != -ENODATA) { +		test_err("lookup of RAID [%llu, %llu] succeeded, should fail", +			 logical1 + SZ_512K, logical1 + SZ_512K + len); +		goto out; +	} + +	/* Clean up after us. */ +	ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K); +	if (ret) +		goto out; + +	ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K); + +out: +	btrfs_put_bioc(bioc); +	return ret; +} + +/*   * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then   * delete the 1st 32K, making the new start address 1M+32K.   
*/ @@ -94,45 +703,45 @@ static int test_front_delete(struct btrfs_trans_handle *trans)  		goto out;  	} -	ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); +	ret = btrfs_delete_raid_extent(trans, logical, SZ_16K);  	if (ret) {  		test_err("deleting RAID extent [%llu, %llu] failed", logical, -			 logical + SZ_32K); +			 logical + SZ_16K);  		goto out;  	} -	len = SZ_32K; -	ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, +	len -= SZ_16K; +	ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len,  					   map_type, 0, &io_stripe);  	if (ret) {  		test_err("lookup of RAID extent [%llu, %llu] failed", -			 logical + SZ_32K, logical + SZ_32K + len); +			 logical + SZ_16K, logical + SZ_64K);  		goto out;  	} -	if (io_stripe.physical != logical + SZ_32K) { +	if (io_stripe.physical != logical + SZ_16K) {  		test_err("invalid physical address, expected %llu, got %llu", -			 logical + SZ_32K, io_stripe.physical); +			 logical + SZ_16K, io_stripe.physical);  		ret = -EINVAL;  		goto out;  	} -	if (len != SZ_32K) { +	if (len != SZ_48K) {  		test_err("invalid stripe length, expected %llu, got %llu", -			 (u64)SZ_32K, len); +			 (u64)SZ_48K, len);  		ret = -EINVAL;  		goto out;  	}  	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); -	if (!ret) { +	if (ret != -ENODATA) {  		ret = -EINVAL;  		test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", -			 logical, logical + SZ_32K); +			 logical, logical + SZ_16K);  		goto out;  	} -	ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); +	ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K);  out:  	btrfs_put_bioc(bioc);  	return ret; @@ -209,14 +818,14 @@ static int test_tail_delete(struct btrfs_trans_handle *trans)  		goto out;  	} -	ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); +	ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K);  	if (ret) {  		test_err("deleting RAID extent [%llu, %llu] failed", -			 logical + SZ_32K, logical + SZ_64K); +			 logical + SZ_48K, logical + SZ_64K);  		goto out;  	} -	len = SZ_32K; +	len = SZ_48K;  	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);  	if (ret) {  		test_err("lookup of RAID extent [%llu, %llu] failed", logical, @@ -231,9 +840,19 @@ static int test_tail_delete(struct btrfs_trans_handle *trans)  		goto out;  	} -	if (len != SZ_32K) { +	if (len != SZ_48K) {  		test_err("invalid stripe length, expected %llu, got %llu", -			 (u64)SZ_32K, len); +			 (u64)SZ_48K, len); +		ret = -EINVAL; +		goto out; +	} + +	len = SZ_16K; +	ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len, +					   map_type, 0, &io_stripe); +	if (ret != -ENODATA) { +		test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", +			 logical + SZ_48K, logical + SZ_64K);  		ret = -EINVAL;  		goto out;  	} @@ -456,6 +1075,10 @@ static const test_func_t tests[] = {  	test_create_update_delete,  	test_tail_delete,  	test_front_delete, +	test_front_delete_prev_item, +	test_punch_hole, +	test_punch_hole_3extents, +	test_delete_two_extents,  };  static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) @@ -478,8 +1101,8 @@ static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)  		ret = PTR_ERR(root);  		goto out;  	} -	btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, -					BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); +	btrfs_set_super_incompat_flags(root->fs_info->super_copy, +				       BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);  	
root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;  	root->root_key.type = BTRFS_ROOT_ITEM_KEY;  	root->root_key.offset = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dc0b837efd5d..15312013f2a3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -795,8 +795,7 @@ alloc_fail:  	if (num_bytes)  		btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);  	if (delayed_refs_bytes) -		btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, -						    delayed_refs_bytes); +		btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes);  reserve_fail:  	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);  	return ERR_PTR(ret); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 184fa5c0062a..9f7c777af635 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)  	delayed_refs->qgroup_to_skip = 0;  } -bool __cold abort_should_print_stack(int error); +/* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +static inline bool btrfs_abort_should_print_stack(int error) +{ +	switch (error) { +	case -EIO: +	case -EROFS: +	case -ENOMEM: +		return false; +	} +	return true; +}  /*   * Call btrfs_abort_transaction as early as possible when an error condition is @@ -240,7 +254,7 @@ do {								\  	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,	\  			&((trans)->fs_info->fs_state))) {	\  		__first = true;					\ -		if (WARN(abort_should_print_stack(error),	\ +		if (WARN(btrfs_abort_should_print_stack(error),	\  			KERN_ERR				\  			"BTRFS: Transaction aborted (error %d)\n",	\  			(error))) {					\ diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index dfeee033f31f..43979891f7c8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -764,22 +764,19 @@ static int check_block_group_item(struct extent_buffer *leaf,  	return 0;  } -__printf(4, 5) +__printf(5, 6)  __cold -static void chunk_err(const struct extent_buffer *leaf, +static void chunk_err(const struct btrfs_fs_info *fs_info, +		      const struct extent_buffer *leaf,  		      const struct btrfs_chunk *chunk, u64 logical,  		      const char *fmt, ...)  { -	const struct btrfs_fs_info *fs_info = leaf->fs_info; -	bool is_sb; +	bool is_sb = !leaf;  	struct va_format vaf;  	va_list args;  	int i;  	int slot = -1; -	/* Only superblock eb is able to have such small offset */ -	is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); -  	if (!is_sb) {  		/*  		 * Get the slot number by iterating through all slots, this @@ -812,13 +809,17 @@ static void chunk_err(const struct extent_buffer *leaf,  /*   * The common chunk check which could also work on super block sys chunk array.   * + * If @leaf is NULL, then @chunk must be an on-stack chunk item. + * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable) + *   * Return -EUCLEAN if anything is corrupted.   * Return 0 if everything is OK.   
*/ -int btrfs_check_chunk_valid(struct extent_buffer *leaf, -			    struct btrfs_chunk *chunk, u64 logical) +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, +			    const struct extent_buffer *leaf, +			    const struct btrfs_chunk *chunk, u64 logical, +			    u32 sectorsize)  { -	struct btrfs_fs_info *fs_info = leaf->fs_info;  	u64 length;  	u64 chunk_end;  	u64 stripe_len; @@ -826,63 +827,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,  	u16 sub_stripes;  	u64 type;  	u64 features; +	u32 chunk_sector_size;  	bool mixed = false;  	int raid_index;  	int nparity;  	int ncopies; -	length = btrfs_chunk_length(leaf, chunk); -	stripe_len = btrfs_chunk_stripe_len(leaf, chunk); -	num_stripes = btrfs_chunk_num_stripes(leaf, chunk); -	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); -	type = btrfs_chunk_type(leaf, chunk); +	if (leaf) { +		length = btrfs_chunk_length(leaf, chunk); +		stripe_len = btrfs_chunk_stripe_len(leaf, chunk); +		num_stripes = btrfs_chunk_num_stripes(leaf, chunk); +		sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); +		type = btrfs_chunk_type(leaf, chunk); +		chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk); +	} else { +		length = btrfs_stack_chunk_length(chunk); +		stripe_len = btrfs_stack_chunk_stripe_len(chunk); +		num_stripes = btrfs_stack_chunk_num_stripes(chunk); +		sub_stripes = btrfs_stack_chunk_sub_stripes(chunk); +		type = btrfs_stack_chunk_type(chunk); +		chunk_sector_size = btrfs_stack_chunk_sector_size(chunk); +	}  	raid_index = btrfs_bg_flags_to_raid_index(type);  	ncopies = btrfs_raid_array[raid_index].ncopies;  	nparity = btrfs_raid_array[raid_index].nparity;  	if (unlikely(!num_stripes)) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			  "invalid chunk num_stripes, have %u", num_stripes);  		return -EUCLEAN;  	}  	if (unlikely(num_stripes < ncopies)) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			  "invalid chunk num_stripes < ncopies, have %u < %d",  			  num_stripes, ncopies);  		return -EUCLEAN;  	}  	if (unlikely(nparity && num_stripes == nparity)) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			  "invalid chunk num_stripes == nparity, have %u == %d",  			  num_stripes, nparity);  		return -EUCLEAN;  	} -	if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { -		chunk_err(leaf, chunk, logical, +	if (unlikely(!IS_ALIGNED(logical, sectorsize))) { +		chunk_err(fs_info, leaf, chunk, logical,  		"invalid chunk logical, have %llu should aligned to %u", -			  logical, fs_info->sectorsize); +			  logical, sectorsize);  		return -EUCLEAN;  	} -	if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { -		chunk_err(leaf, chunk, logical, +	if (unlikely(chunk_sector_size != sectorsize)) { +		chunk_err(fs_info, leaf, chunk, logical,  			  "invalid chunk sectorsize, have %u expect %u", -			  btrfs_chunk_sector_size(leaf, chunk), -			  fs_info->sectorsize); +			  chunk_sector_size, sectorsize);  		return -EUCLEAN;  	} -	if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { -		chunk_err(leaf, chunk, logical, +	if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) { +		chunk_err(fs_info, leaf, chunk, logical,  			  "invalid chunk length, have %llu", length);  		return -EUCLEAN;  	}  	if (unlikely(check_add_overflow(logical, length, &chunk_end))) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  "invalid chunk logical start and length, have logical 
start %llu length %llu",  			  logical, length);  		return -EUCLEAN;  	}  	if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			  "invalid chunk stripe length: %llu",  			  stripe_len);  		return -EUCLEAN; @@ -896,30 +907,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,  	 * Thus it should be a good way to catch obvious bitflips.  	 */  	if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			  "chunk length too large: have %llu limit %llu",  			  length, btrfs_stripe_nr_to_offset(U32_MAX));  		return -EUCLEAN;  	}  	if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |  			      BTRFS_BLOCK_GROUP_PROFILE_MASK))) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			  "unrecognized chunk type: 0x%llx",  			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK | -			    BTRFS_BLOCK_GROUP_PROFILE_MASK) & -			  btrfs_chunk_type(leaf, chunk)); +			    BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);  		return -EUCLEAN;  	}  	if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&  		     (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  		"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",  			  type & BTRFS_BLOCK_GROUP_PROFILE_MASK);  		return -EUCLEAN;  	}  	if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  	"missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",  			  type, BTRFS_BLOCK_GROUP_TYPE_MASK);  		return -EUCLEAN; @@ -928,7 +938,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,  	if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&  		     (type & (BTRFS_BLOCK_GROUP_METADATA |  			      BTRFS_BLOCK_GROUP_DATA)))) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			  "system chunk with data or metadata type: 0x%llx",  			  type);  		return -EUCLEAN; @@ -941,7 +951,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,  	if (!mixed) {  		if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&  			     (type & BTRFS_BLOCK_GROUP_DATA))) { -			chunk_err(leaf, chunk, logical, +			chunk_err(fs_info, leaf, chunk, logical,  			"mixed chunk type in non-mixed mode: 0x%llx", type);  			return -EUCLEAN;  		} @@ -963,7 +973,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,  		      num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||  		     ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&  		      num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { -		chunk_err(leaf, chunk, logical, +		chunk_err(fs_info, leaf, chunk, logical,  			"invalid num_stripes:sub_stripes %u:%u for profile %llu",  			num_stripes, sub_stripes,  			type & BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -983,14 +993,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,  				 struct btrfs_chunk *chunk,  				 struct btrfs_key *key, int slot)  { +	struct btrfs_fs_info *fs_info = leaf->fs_info;  	int num_stripes;  	if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { -		chunk_err(leaf, chunk, key->offset, +		chunk_err(fs_info, leaf, chunk, key->offset,  			"invalid chunk item size: have %u expect [%zu, %u)",  			btrfs_item_size(leaf, slot),  			sizeof(struct btrfs_chunk), -			BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); +		
	BTRFS_LEAF_DATA_SIZE(fs_info));  		return -EUCLEAN;  	} @@ -1001,14 +1012,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,  	if (unlikely(btrfs_chunk_item_size(num_stripes) !=  		     btrfs_item_size(leaf, slot))) { -		chunk_err(leaf, chunk, key->offset, +		chunk_err(fs_info, leaf, chunk, key->offset,  			"invalid chunk item size: have %u expect %lu",  			btrfs_item_size(leaf, slot),  			btrfs_chunk_item_size(num_stripes));  		return -EUCLEAN;  	}  out: -	return btrfs_check_chunk_valid(leaf, chunk, key->offset); +	return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset, +				       fs_info->sectorsize);  }  __printf(3, 4) diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index db67f96cbe4b..eb201f4ec3c7 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -10,6 +10,7 @@  #include <uapi/linux/btrfs_tree.h>  struct extent_buffer; +struct btrfs_fs_info;  struct btrfs_chunk;  struct btrfs_key; @@ -66,8 +67,10 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node);  int btrfs_check_leaf(struct extent_buffer *leaf);  int btrfs_check_node(struct extent_buffer *node); -int btrfs_check_chunk_valid(struct extent_buffer *leaf, -			    struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, +			    const struct extent_buffer *leaf, +			    const struct btrfs_chunk *chunk, u64 logical, +			    u32 sectorsize);  int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);  int btrfs_verify_level_key(struct extent_buffer *eb,  			   const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c8d6587688b3..955d1677e865 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -590,7 +590,6 @@ insert:  		}  	}  no_copy: -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	btrfs_release_path(path);  	return 0;  } @@ -3588,7 +3587,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,  		last_offset = max(last_offset, curr_end);  	}  	btrfs_set_dir_log_end(path->nodes[0], item, last_offset); -	btrfs_mark_buffer_dirty(trans, path->nodes[0]);  	btrfs_release_path(path);  	return 0;  } @@ -4566,7 +4564,6 @@ copy_item:  		dst_index++;  	} -	btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);  	btrfs_release_path(dst_path);  out:  	kfree(ins_data); @@ -4776,7 +4773,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,  	write_extent_buffer(leaf, &fi,  			    btrfs_item_ptr_offset(leaf, path->slots[0]),  			    sizeof(fi)); -	btrfs_mark_buffer_dirty(trans, leaf);  	btrfs_release_path(path); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index aca2861f2187..17b5e81123a1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -140,8 +140,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ  	ret = 0;  	subid_le = cpu_to_le64(subid_cpu);  	write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); -	btrfs_mark_buffer_dirty(trans, eb); -  out:  	btrfs_free_path(path);  	return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3d0ac8bdb21f..0a0776489055 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,8 +13,8 @@  #include <linux/list_sort.h>  #include <linux/namei.h>  #include "misc.h" -#include "ctree.h"  #include "disk-io.h" +#include "extent-tree.h"  #include "transaction.h"  #include "volumes.h"  #include "raid56.h" @@ -48,6 +48,7 @@ struct btrfs_io_geometry {  	u64 raid56_full_stripe_start;  	int 
max_errors;  	enum btrfs_map_op op; +	bool use_rst;  };  const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -1302,6 +1303,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,  	struct btrfs_device *device;  	struct btrfs_device *latest_dev = NULL;  	struct btrfs_device *tmp_device; +	s64 __maybe_unused value = 0;  	int ret = 0;  	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1331,7 +1333,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,  	fs_devices->latest_dev = latest_dev;  	fs_devices->total_rw_bytes = 0;  	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; +	fs_devices->read_devid = latest_dev->devid; +	fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), +							    &value); +	if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) +		fs_devices->collect_fs_stats = true; + +	if (value) { +		if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) +			fs_devices->rr_min_contig_read = value; +		if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) +			fs_devices->read_devid = value; +	} +#else  	fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#endif  	return 0;  } @@ -2049,7 +2067,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,  	ptr = btrfs_device_fsid(dev_item);  	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,  			    ptr, BTRFS_FSID_SIZE); -	btrfs_mark_buffer_dirty(trans, leaf);  	ret = 0;  out: @@ -2745,11 +2762,9 @@ next_slot:  		device = btrfs_find_device(fs_info->fs_devices, &args);  		BUG_ON(!device); /* Logic error */ -		if (device->fs_devices->seeding) { +		if (device->fs_devices->seeding)  			btrfs_set_device_generation(leaf, dev_item,  						    device->generation); -			btrfs_mark_buffer_dirty(trans, leaf); -		}  		path->slots[0]++;  		goto next_slot; @@ -3042,8 +3057,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,  				     btrfs_device_get_disk_total_bytes(device));  	btrfs_set_device_bytes_used(leaf, dev_item,  				    btrfs_device_get_bytes_used(device)); -	btrfs_mark_buffer_dirty(trans, leaf); -  out:  	btrfs_free_path(path);  	return ret; @@ -3752,10 +3765,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,  	btrfs_set_balance_meta(leaf, item, &disk_bargs);  	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);  	btrfs_set_balance_sys(leaf, item, &disk_bargs); -  	btrfs_set_balance_flags(leaf, item, bctl->flags); - -	btrfs_mark_buffer_dirty(trans, leaf);  out:  	btrfs_free_path(path);  	err = btrfs_commit_transaction(trans); @@ -5517,33 +5527,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma  	btrfs_free_chunk_map(map);  } +static int btrfs_chunk_map_cmp(const struct rb_node *new, +			       const struct rb_node *exist) +{ +	const struct btrfs_chunk_map *new_map = +		rb_entry(new, struct btrfs_chunk_map, rb_node); +	const struct btrfs_chunk_map *exist_map = +		rb_entry(exist, struct btrfs_chunk_map, rb_node); + +	if (new_map->start == exist_map->start) +		return 0; +	if (new_map->start < exist_map->start) +		return -1; +	return 1; +} +  EXPORT_FOR_TESTS  int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)  { -	struct rb_node **p; -	struct rb_node *parent = NULL; -	bool leftmost = true; +	struct rb_node *exist;  	write_lock(&fs_info->mapping_tree_lock); -	p = 
&fs_info->mapping_tree.rb_root.rb_node; -	while (*p) { -		struct btrfs_chunk_map *entry; - -		parent = *p; -		entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); - -		if (map->start < entry->start) { -			p = &(*p)->rb_left; -		} else if (map->start > entry->start) { -			p = &(*p)->rb_right; -			leftmost = false; -		} else { -			write_unlock(&fs_info->mapping_tree_lock); -			return -EEXIST; -		} +	exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, +				   btrfs_chunk_map_cmp); + +	if (exist) { +		write_unlock(&fs_info->mapping_tree_lock); +		return -EEXIST;  	} -	rb_link_node(&map->rb_node, parent, p); -	rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);  	chunk_map_device_set_bits(map, CHUNK_ALLOCATED);  	chunk_map_device_clear_bits(map, CHUNK_TRIMMED);  	write_unlock(&fs_info->mapping_tree_lock); @@ -5963,6 +5974,76 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,  	return len;  } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) +{ +	for (int index = first; index < first + num_stripes; index++) { +		const struct btrfs_device *device = map->stripes[index].dev; + +		if (device->devid == READ_ONCE(device->fs_devices->read_devid)) +			return index; +	} + +	/* If no read-preferred device is set use the first stripe. */ +	return first; +} + +struct stripe_mirror { +	u64 devid; +	int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ +	const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; +	const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + +	if (s1->devid < s2->devid) +		return -1; +	if (s1->devid > s2->devid) +		return 1; +	return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + *  1. Compute the read cycle as the total sectors read divided by the minimum + *     sectors per device. + *  2. Determine the stripe number for the current read by taking the modulus + *     of the read cycle with the total number of stripes: + * + *      stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. 
+ */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ +	struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; +	struct btrfs_device *device  = map->stripes[first].dev; +	struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; +	unsigned int read_cycle; +	unsigned int total_reads; +	unsigned int min_reads_per_dev; + +	total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); +	min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> +						       fs_info->sectorsize_bits; + +	for (int index = 0, i = first; i < first + num_stripes; i++) { +		stripes[index].devid = map->stripes[i].dev->devid; +		stripes[index].num = i; +		index++; +	} +	sort(stripes, num_stripes, sizeof(struct stripe_mirror), +	     btrfs_cmp_devid, NULL); + +	read_cycle = total_reads / min_reads_per_dev; +	return stripes[read_cycle % num_stripes].num; +} +#endif +  static int find_live_mirror(struct btrfs_fs_info *fs_info,  			    struct btrfs_chunk_map *map, int first,  			    int dev_replace_is_ongoing) @@ -5992,6 +6073,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,  	case BTRFS_READ_POLICY_PID:  		preferred_mirror = first + (current->pid % num_stripes);  		break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	case BTRFS_READ_POLICY_RR: +		preferred_mirror = btrfs_read_rr(map, first, num_stripes); +		break; +	case BTRFS_READ_POLICY_DEVID: +		preferred_mirror = btrfs_read_preferred(map, first, num_stripes); +		break; +#endif  	}  	if (dev_replace_is_ongoing && @@ -6350,8 +6439,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,  {  	dst->dev = map->stripes[io_geom->stripe_index].dev; -	if (io_geom->op == BTRFS_MAP_READ && -	    btrfs_need_stripe_tree_update(fs_info, map->type)) +	if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)  		return btrfs_get_raid_extent_offset(fs_info, logical, length,  						    map->type,  						    io_geom->stripe_index, dst); @@ -6366,7 +6454,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,  				const struct btrfs_io_stripe *smap,  				const struct btrfs_chunk_map *map,  				int num_alloc_stripes, -				enum btrfs_map_op op, int mirror_num) +				struct btrfs_io_geometry *io_geom)  {  	if (!smap)  		return false; @@ -6374,10 +6462,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,  	if (num_alloc_stripes != 1)  		return false; -	if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) +	if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)  		return false; -	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) +	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)  		return false;  	return true; @@ -6583,6 +6671,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  	io_geom.raid56_full_stripe_start = (u64)-1;  	max_len = btrfs_max_io_len(map, map_offset, &io_geom);  	*length = min_t(u64, map->chunk_len - map_offset, max_len); +	io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);  	if (dev_replace->replace_task != current)  		down_read(&dev_replace->rwsem); @@ -6651,8 +6740,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  	 * physical block information on the stack instead of allocating an  	 * I/O context structure.  	 
*/ -	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, -				io_geom.mirror_num)) { +	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {  		ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);  		if (mirror_num_ret)  			*mirror_num_ret = io_geom.mirror_num; @@ -6666,6 +6754,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  		goto out;  	}  	bioc->map_type = map->type; +	bioc->use_rst = io_geom.use_rst;  	/*  	 * For RAID56 full map, we need to make sure the stripes[] follows the @@ -7006,16 +7095,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,  	warn_32bit_meta_chunk(fs_info, logical, length, type);  #endif -	/* -	 * Only need to verify chunk item if we're reading from sys chunk array, -	 * as chunk item in tree block is already verified by tree-checker. -	 */ -	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { -		ret = btrfs_check_chunk_valid(leaf, chunk, logical); -		if (ret) -			return ret; -	} -  	map = btrfs_find_chunk_map(fs_info, logical, 1);  	/* already mapped? */ @@ -7273,16 +7352,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  {  	struct btrfs_super_block *super_copy = fs_info->super_copy;  	struct extent_buffer *sb; -	struct btrfs_disk_key *disk_key; -	struct btrfs_chunk *chunk;  	u8 *array_ptr;  	unsigned long sb_array_offset;  	int ret = 0; -	u32 num_stripes;  	u32 array_size; -	u32 len = 0;  	u32 cur_offset; -	u64 type;  	struct btrfs_key key;  	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); @@ -7305,10 +7379,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  	cur_offset = 0;  	while (cur_offset < array_size) { -		disk_key = (struct btrfs_disk_key *)array_ptr; -		len = sizeof(*disk_key); -		if (cur_offset + len > array_size) -			goto out_short_read; +		struct btrfs_chunk *chunk; +		struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; +		u32 len = sizeof(*disk_key); + +		/* +		 * The sys_chunk_array has been already verified at super block +		 * read time.  Only do ASSERT()s for basic checks. 
+		 */ +		ASSERT(cur_offset + len <= array_size);  		btrfs_disk_key_to_cpu(&key, disk_key); @@ -7316,44 +7395,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  		sb_array_offset += len;  		cur_offset += len; -		if (key.type != BTRFS_CHUNK_ITEM_KEY) { -			btrfs_err(fs_info, -			    "unexpected item type %u in sys_array at offset %u", -				  (u32)key.type, cur_offset); -			ret = -EIO; -			break; -		} +		ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);  		chunk = (struct btrfs_chunk *)sb_array_offset; -		/* -		 * At least one btrfs_chunk with one stripe must be present, -		 * exact stripe count check comes afterwards -		 */ -		len = btrfs_chunk_item_size(1); -		if (cur_offset + len > array_size) -			goto out_short_read; - -		num_stripes = btrfs_chunk_num_stripes(sb, chunk); -		if (!num_stripes) { -			btrfs_err(fs_info, -			"invalid number of stripes %u in sys_array at offset %u", -				  num_stripes, cur_offset); -			ret = -EIO; -			break; -		} +		ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); -		type = btrfs_chunk_type(sb, chunk); -		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { -			btrfs_err(fs_info, -			"invalid chunk type %llu in sys_array at offset %u", -				  type, cur_offset); -			ret = -EIO; -			break; -		} +		len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); -		len = btrfs_chunk_item_size(num_stripes); -		if (cur_offset + len > array_size) -			goto out_short_read; +		ASSERT(cur_offset + len <= array_size);  		ret = read_one_chunk(&key, sb, chunk);  		if (ret) @@ -7366,13 +7415,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  	clear_extent_buffer_uptodate(sb);  	free_extent_buffer_stale(sb);  	return ret; - -out_short_read: -	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", -			len, cur_offset); -	clear_extent_buffer_uptodate(sb); -	free_extent_buffer_stale(sb); -	return -EIO;  }  /* @@ -7572,8 +7614,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)  	struct btrfs_device *device;  	int ret = 0; -	fs_devices->fs_info = fs_info; -  	mutex_lock(&fs_devices->device_list_mutex);  	list_for_each_entry(device, &fs_devices->devices, dev_list)  		device->fs_info = fs_info; @@ -7749,8 +7789,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,  	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)  		btrfs_set_dev_stats_value(eb, ptr, i,  					  btrfs_dev_stat_read(device, i)); -	btrfs_mark_buffer_dirty(trans, eb); -  out:  	btrfs_free_path(path);  	return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3a416b1bc24c..120f65e21eeb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy {  	BTRFS_CHUNK_ALLOC_ZONED,  }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ	(SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS			(4)  /*   * Read policies for mirrored block group profiles, read picks the stripe based   * on these policies. @@ -303,6 +306,12 @@ enum btrfs_chunk_allocation_policy {  enum btrfs_read_policy {  	/* Use process PID to choose the stripe */  	BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	/* Balancing RAID1 reads across all striped devices (round-robin). */ +	BTRFS_READ_POLICY_RR, +	/* Read from a specific device. */ +	BTRFS_READ_POLICY_DEVID, +#endif  	BTRFS_NR_READ_POLICY,  }; @@ -417,6 +426,8 @@ struct btrfs_fs_devices {  	bool seeding;  	/* The mount needs to use a randomly generated fsid. 
*/  	bool temp_fsid; +	/* Enable/disable the filesystem stats tracking. */ +	bool collect_fs_stats;  	struct btrfs_fs_info *fs_info;  	/* sysfs kobjects */ @@ -431,6 +442,15 @@ struct btrfs_fs_devices {  	enum btrfs_read_policy read_policy;  #ifdef CONFIG_BTRFS_EXPERIMENTAL +	/* +	 * Minimum contiguous reads before switching to next device, the unit +	 * is one block/sectorsize. +	 */ +	u32 rr_min_contig_read; + +	/* Device to be used for reading in case of RAID1. */ +	u64 read_devid; +  	/* Checksum mode - offload it or do it synchronously. */  	enum btrfs_offload_csum_mode offload_csum_mode;  #endif @@ -485,6 +505,7 @@ struct btrfs_io_context {  	struct bio *orig_bio;  	atomic_t error;  	u16 max_errors; +	bool use_rst;  	u64 logical;  	u64 size; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index bc18710d1dcf..3e0edbcf73e1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -204,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,  		btrfs_set_dir_data_len(leaf, di, size);  		data_ptr = ((unsigned long)(di + 1)) + name_len;  		write_extent_buffer(leaf, value, data_ptr, size); -		btrfs_mark_buffer_dirty(trans, leaf);  	} else {  		/*  		 * Insert, and we had space for the xattr, so path->slots[0] is diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index df905ae82929..73e0aa9fc08a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2652,3 +2652,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)  	}  	spin_unlock(&fs_info->zone_active_bgs_lock);  } + +/* + * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. + * + * @space_info:	the space to work on + * @num_bytes:	targeting reclaim bytes + * + * This one resets the zones of a block group, so we can reuse the region + * without removing the block group. On the other hand, btrfs_delete_unused_bgs() + * just removes a block group and frees up the underlying zones. So, we still + * need to allocate a new block group to reuse the zones. + * + * Resetting is faster than deleting/recreating a block group. It is similar + * to freeing the logical space on the regular mode. However, we cannot change + * the block group's profile with this operation. + */ +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) +{ +	struct btrfs_fs_info *fs_info = space_info->fs_info; +	const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; + +	if (!btrfs_is_zoned(fs_info)) +		return 0; + +	while (num_bytes > 0) { +		struct btrfs_chunk_map *map; +		struct btrfs_block_group *bg = NULL; +		bool found = false; +		u64 reclaimed = 0; + +		/* +		 * Here, we choose a fully zone_unusable block group. It's +		 * technically possible to reset a partly zone_unusable block +		 * group, which still has some free space left. However, +		 * handling that needs to cope with the allocation side, which +		 * makes the logic more complex. So, let's handle the easy case +		 * for now. +		 */ +		spin_lock(&fs_info->unused_bgs_lock); +		list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { +			if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) +				continue; + +			/* +			 * Use trylock to avoid locking order violation. In +			 * btrfs_reclaim_bgs_work(), the lock order is +			 * &bg->lock -> &fs_info->unused_bgs_lock. We skip a +			 * block group if we cannot take its lock. 
+			 */ +			if (!spin_trylock(&bg->lock)) +				continue; +			if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { +				spin_unlock(&bg->lock); +				continue; +			} +			spin_unlock(&bg->lock); +			found = true; +			break; +		} +		if (!found) { +			spin_unlock(&fs_info->unused_bgs_lock); +			return 0; +		} + +		list_del_init(&bg->bg_list); +		btrfs_put_block_group(bg); +		spin_unlock(&fs_info->unused_bgs_lock); + +		/* +		 * Since the block group is fully zone_unusable and we cannot +		 * allocate from this block group anymore, we don't need to set +		 * this block group read-only. +		 */ + +		down_read(&fs_info->dev_replace.rwsem); +		map = bg->physical_map; +		for (int i = 0; i < map->num_stripes; i++) { +			struct btrfs_io_stripe *stripe = &map->stripes[i]; +			unsigned int nofs_flags; +			int ret; + +			nofs_flags = memalloc_nofs_save(); +			ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, +					       stripe->physical >> SECTOR_SHIFT, +					       zone_size_sectors); +			memalloc_nofs_restore(nofs_flags); + +			if (ret) { +				up_read(&fs_info->dev_replace.rwsem); +				return ret; +			} +		} +		up_read(&fs_info->dev_replace.rwsem); + +		spin_lock(&space_info->lock); +		spin_lock(&bg->lock); +		ASSERT(!btrfs_is_block_group_used(bg)); +		if (bg->ro) { +			spin_unlock(&bg->lock); +			spin_unlock(&space_info->lock); +			continue; +		} + +		reclaimed = bg->alloc_offset; +		bg->zone_unusable = bg->length - bg->zone_capacity; +		bg->alloc_offset = 0; +		/* +		 * This holds because we currently reset fully used then freed +		 * block group. +		 */ +		ASSERT(reclaimed == bg->zone_capacity); +		bg->free_space_ctl->free_space += reclaimed; +		space_info->bytes_zone_unusable -= reclaimed; +		spin_unlock(&bg->lock); +		btrfs_return_free_space(space_info, reclaimed); +		spin_unlock(&space_info->lock); + +		if (num_bytes <= reclaimed) +			break; +		num_bytes -= reclaimed; +	} + +	return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 7612e6572605..9672bf4c3335 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);  int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,  				struct btrfs_space_info *space_info, bool do_finish);  void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);  #else /* CONFIG_BLK_DEV_ZONED */  static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,  static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } +static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, +						  u64 num_bytes) +{ +	return 0; +} +  #endif  static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) | 
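The reworked btrfs_check_chunk_valid() above takes an explicit fs_info, accepts a NULL @leaf, and receives the sectorsize as a parameter, so a chunk item copied out of the superblock sys_chunk_array can be validated before any extent buffer exists and before fs_info->sectorsize is trustworthy. A hypothetical caller sketch under those assumptions; the helper name and the use of btrfs_super_sectorsize() are illustrative and not taken from this diff:

/*
 * Hypothetical helper: validate one entry of the superblock sys_chunk_array.
 * No extent buffer backs the item, so pass a NULL leaf and take the sector
 * size from the superblock copy instead of fs_info->sectorsize.
 */
static int validate_sys_array_chunk(const struct btrfs_fs_info *fs_info,
				    const struct btrfs_chunk *chunk,
				    u64 logical)
{
	u32 sectorsize = btrfs_super_sectorsize(fs_info->super_copy);

	return btrfs_check_chunk_valid(fs_info, NULL, chunk, logical, sectorsize);
}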
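btrfs_add_chunk_map() above is converted from an open-coded rb-tree descent to rb_find_add_cached() with a comparator, which links and rebalances the new node (and maintains the leftmost cache) or returns the already-existing node. A minimal sketch of the same pattern with an illustrative item type; only the rb_find_add_cached() calling convention comes from this diff:

#include <linux/errno.h>
#include <linux/rbtree.h>
#include <linux/types.h>

/* Illustrative item type; 'start' is the tree key. */
struct demo_item {
	struct rb_node rb_node;
	u64 start;
};

/*
 * rb_find_add_cached() passes the node being inserted first and the existing
 * node second; return <0, 0 or >0 like memcmp().
 */
static int demo_item_cmp(const struct rb_node *new, const struct rb_node *exist)
{
	const struct demo_item *n = rb_entry(new, struct demo_item, rb_node);
	const struct demo_item *e = rb_entry(exist, struct demo_item, rb_node);

	if (n->start < e->start)
		return -1;
	if (n->start > e->start)
		return 1;
	return 0;
}

static int demo_item_insert(struct rb_root_cached *root, struct demo_item *item)
{
	struct rb_node *exist;

	exist = rb_find_add_cached(&item->rb_node, root, demo_item_cmp);
	if (exist)
		return -EEXIST;	/* duplicate key, node was not inserted */
	return 0;
}

As in btrfs_add_chunk_map(), a non-NULL return value is the duplicate node and is typically translated into -EEXIST by the caller while still holding the lock that protects the tree.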
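The btrfs_read_rr() comment above boils down to: stripe index = (total blocks read / rr_min_contig_read in blocks) % num_stripes, applied to the stripes sorted by devid. A small standalone illustration of that arithmetic only, with made-up numbers (4K blocks, the default 256K rr_min_contig_read, a two-copy RAID1 chunk):

#include <stdio.h>

int main(void)
{
	unsigned int sectorsize_bits = 12;              /* 4K blocks */
	unsigned int rr_min_contig_read = 256 * 1024;   /* BTRFS_DEFAULT_RR_MIN_CONTIG_READ */
	unsigned int min_reads_per_dev = rr_min_contig_read >> sectorsize_bits; /* 64 blocks */
	unsigned int num_stripes = 2;                   /* plain RAID1 */

	for (unsigned int total_reads = 0; total_reads <= 320; total_reads += 64) {
		unsigned int read_cycle = total_reads / min_reads_per_dev;

		printf("total_reads=%3u -> cycle=%u -> stripe index %u\n",
		       total_reads, read_cycle, read_cycle % num_stripes);
	}
	return 0;
}

With these numbers the selected stripe flips every 64 blocks, i.e. after each 256K of contiguous reads, which is the behaviour the rr_min_contig_read knob is meant to control.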
