| author | Jonathan Cameron <Jonathan.Cameron@huawei.com> | 2025-09-13 15:00:48 +0100 |
|---|---|---|
| committer | Jonathan Cameron <Jonathan.Cameron@huawei.com> | 2025-09-13 15:00:48 +0100 |
| commit | 421d4487ef2ead206f57a8950ea9bdd1f7a7b39a | (patch) |
| tree | b679546eeaa70f88b5c8d6caeb79e214efb68121 | /fs/btrfs |
| parent | 3422b4bc606eee2ba7758ea9347c83332eeec3e3 | (diff) |
| parent | 1b237f190eb3d36f52dffe07a40b5eb210280e00 | (diff) |
Merge tag 'v6.17-rc3' into togreg
Linux 6.17-rc3
Diffstat (limited to 'fs/btrfs')
| -rw-r--r-- | fs/btrfs/extent_io.c | 35 |
| -rw-r--r-- | fs/btrfs/inode.c | 37 |
| -rw-r--r-- | fs/btrfs/qgroup.c | 3 |
| -rw-r--r-- | fs/btrfs/relocation.c | 19 |
| -rw-r--r-- | fs/btrfs/subpage.c | 19 |
| -rw-r--r-- | fs/btrfs/super.c | 13 |
| -rw-r--r-- | fs/btrfs/tree-log.c | 19 |
| -rw-r--r-- | fs/btrfs/zoned.c | 135 |
8 files changed, 202 insertions, 78 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 835b0deef9bb..c953297aa89a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1512,7 +1512,7 @@ out:
 
 /*
  * Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors.
+ * Return <0 for critical errors, and the sector will have its dirty flag cleared.
  *
  * Caller should make sure filepos < i_size and handle filepos >= i_size case.
  */
@@ -1535,8 +1535,17 @@ static int submit_one_sector(struct btrfs_inode *inode,
 	ASSERT(filepos < i_size);
 
 	em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
-	if (IS_ERR(em))
+	if (IS_ERR(em)) {
+		/*
+		 * When submission failed, we should still clear the folio dirty.
+		 * Or the folio will be written back again but without any
+		 * ordered extent.
+		 */
+		btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+		btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
+		btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
 		return PTR_ERR(em);
+	}
 
 	extent_offset = filepos - em->start;
 	em_end = btrfs_extent_map_end(em);
@@ -1609,8 +1618,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 		folio_unlock(folio);
 		return 1;
 	}
-	if (ret < 0)
+	if (ret < 0) {
+		btrfs_folio_clear_dirty(fs_info, folio, start, len);
+		btrfs_folio_set_writeback(fs_info, folio, start, len);
+		btrfs_folio_clear_writeback(fs_info, folio, start, len);
 		return ret;
+	}
 
 	for (cur = start; cur < start + len; cur += fs_info->sectorsize)
 		set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
@@ -1666,8 +1679,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 	 * Here we set writeback and clear for the range. If the full folio
 	 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
 	 *
-	 * If we hit any error, the corresponding sector will still be dirty
-	 * thus no need to clear PAGECACHE_TAG_DIRTY.
+	 * If we hit any error, the corresponding sector will have its dirty
+	 * flag cleared and writeback finished, thus no need to handle the error case.
 	 */
 	if (!submitted_io && !error) {
 		btrfs_folio_set_writeback(fs_info, folio, start, len);
@@ -1813,6 +1826,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
 		xas_load(&xas);
 		xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
 		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
 		xas_unlock_irqrestore(&xas, flags);
 
 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
@@ -4331,15 +4345,18 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 	unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
 	int ret;
 
-	xa_lock_irq(&fs_info->buffer_tree);
+	rcu_read_lock();
 	xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
 		/*
 		 * The same as try_release_extent_buffer(), to ensure the eb
 		 * won't disappear out from under us.
 		 */
		spin_lock(&eb->refs_lock);
+		rcu_read_unlock();
+
 		if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
 			spin_unlock(&eb->refs_lock);
+			rcu_read_lock();
 			continue;
 		}
 
@@ -4358,11 +4375,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 		 * check the folio private at the end.  And
 		 * release_extent_buffer() will release the refs_lock.
 		 */
-		xa_unlock_irq(&fs_info->buffer_tree);
 		release_extent_buffer(eb);
-		xa_lock_irq(&fs_info->buffer_tree);
+		rcu_read_lock();
 	}
-	xa_unlock_irq(&fs_info->buffer_tree);
+	rcu_read_unlock();
 
 	/*
 	 * Finally to check if we have cleared folio private, as if we have
@@ -4375,7 +4391,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 		ret = 0;
 	spin_unlock(&folio->mapping->i_private_lock);
 	return ret;
-
 }
 
 int try_release_extent_buffer(struct folio *folio)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b77dd22b8cdb..9e4aec7330cb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -401,10 +401,12 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 
 	while (index <= end_index) {
 		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
-		index++;
-		if (IS_ERR(folio))
+		if (IS_ERR(folio)) {
+			index++;
 			continue;
+		}
 
+		index = folio_end(folio) >> PAGE_SHIFT;
 		/*
 		 * Here we just clear all Ordered bits for every page in the
 		 * range, then btrfs_mark_ordered_io_finished() will handle
@@ -2013,7 +2015,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
 	 * cleaered by the caller.
 	 */
 	if (ret < 0)
-		btrfs_cleanup_ordered_extents(inode, file_pos, end);
+		btrfs_cleanup_ordered_extents(inode, file_pos, len);
 	return ret;
 }
 
@@ -4187,6 +4189,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
+{
+	struct timespec64 now;
+
+	/*
+	 * If we are replaying a log tree, we do not want to update the mtime
+	 * and ctime of the parent directory with the current time, since the
+	 * log replay procedure is responsible for setting them to their correct
+	 * values (the ones it had when the fsync was done).
+	 */
+	if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
+		return;
+
+	now = inode_set_ctime_current(&dir->vfs_inode);
+	inode_set_mtime_to_ts(&dir->vfs_inode, now);
+}
+
 /*
  * unlink helper that gets used here in inode.c and in the tree logging
  * recovery code.  It remove a link in a directory with a given name, and
@@ -4287,7 +4306,7 @@ skip_backref:
 	inode_inc_iversion(&inode->vfs_inode);
 	inode_set_ctime_current(&inode->vfs_inode);
 	inode_inc_iversion(&dir->vfs_inode);
-	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+	update_time_after_link_or_unlink(dir);
 	return btrfs_update_inode(trans, dir);
 }
 
@@ -6681,15 +6700,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
 			   name->len * 2);
 	inode_inc_iversion(&parent_inode->vfs_inode);
-	/*
-	 * If we are replaying a log tree, we do not want to update the mtime
-	 * and ctime of the parent directory with the current time, since the
-	 * log replay procedure is responsible for setting them to their correct
-	 * values (the ones it had when the fsync was done).
-	 */
-	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
-		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
-				      inode_set_ctime_current(&parent_inode->vfs_inode));
+	update_time_after_link_or_unlink(parent_inode);
 
 	ret = btrfs_update_inode(trans, parent_inode);
 	if (ret)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1a5972178b3a..ccaa9a3cf1ce 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1453,7 +1453,6 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
 				    struct btrfs_qgroup *src, int sign)
 {
 	struct btrfs_qgroup *qgroup;
-	struct btrfs_qgroup *cur;
 	LIST_HEAD(qgroup_list);
 	u64 num_bytes = src->excl;
 	int ret = 0;
@@ -1463,7 +1462,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
 		goto out;
 
 	qgroup_iterator_add(&qgroup_list, qgroup);
-	list_for_each_entry(cur, &qgroup_list, iterator) {
+	list_for_each_entry(qgroup, &qgroup_list, iterator) {
 		struct btrfs_qgroup_list *glist;
 
 		qgroup->rfer += sign * num_bytes;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e58151933844..7256f6748c8f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -602,6 +602,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
 	if (btrfs_root_id(root) == objectid) {
 		u64 commit_root_gen;
 
+		/*
+		 * Relocation will wait for cleaner thread, and any half-dropped
+		 * subvolume will be fully cleaned up at mount time.
+		 * So here we shouldn't hit a subvolume with non-zero drop_progress.
+		 *
+		 * If this isn't the case, error out since it can make us attempt to
+		 * drop references for extents that were already dropped before.
+		 */
+		if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+			struct btrfs_key cpu_key;
+
+			btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+			btrfs_err(fs_info,
+	"cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
+				  objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
+			ret = -EUCLEAN;
+			goto fail;
+		}
+
 		/* called by btrfs_init_reloc_root */
 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
 				      BTRFS_TREE_RELOC_OBJECTID);
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index c9b3821957f7..cb4f97833dc3 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
 
 	spin_lock_irqsave(&bfs->lock, flags);
 	bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+
+	/*
+	 * Don't clear the TOWRITE tag when starting writeback on a still-dirty
+	 * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
+	 * assume writeback is complete, and exit too early, violating sync
+	 * ordering guarantees.
+	 */
 	if (!folio_test_writeback(folio))
-		folio_start_writeback(folio);
+		__folio_start_writeback(folio, true);
+	if (!folio_test_dirty(folio)) {
+		struct address_space *mapping = folio_mapping(folio);
+		XA_STATE(xas, &mapping->i_pages, folio->index);
+		unsigned long flags;
+
+		xas_lock_irqsave(&xas, flags);
+		xas_load(&xas);
+		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+		xas_unlock_irqrestore(&xas, flags);
+	}
 	spin_unlock_irqrestore(&bfs->lock, flags);
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 68e35a3700ff..a262b494a89f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -88,6 +88,9 @@ struct btrfs_fs_context {
 	refcount_t refs;
 };
 
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+			       struct btrfs_fs_context *old);
+
 enum {
 	Opt_acl,
 	Opt_clear_cache,
@@ -698,12 +701,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
 
 	if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
 		if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
-			btrfs_info(info, "disk space caching is enabled");
 			btrfs_warn(info,
 "space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
 		}
-		if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
-			btrfs_info(info, "using free-space-tree");
 	}
 
 	return ret;
@@ -980,6 +980,8 @@ static int btrfs_fill_super(struct super_block *sb,
 		return ret;
 	}
 
+	btrfs_emit_options(fs_info, NULL);
+
 	inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
@@ -1437,7 +1439,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
 {
 	btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
 	btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
-	btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+	btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
 	btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
 	btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
 	btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1459,10 +1461,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
 	btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
 	btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
 
+	btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
 	btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
 	btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
 	btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
-	btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+	btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
 	btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
 	btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
 	btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2186e87fb61b..69e11557fd13 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2605,14 +2605,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 /*
  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
 */
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
 {
 	struct btrfs_block_group *cache;
 
 	cache = btrfs_lookup_block_group(fs_info, start);
 	if (!cache) {
 		btrfs_err(fs_info, "unable to find block group for %llu", start);
-		return;
+		return -ENOENT;
 	}
 
 	spin_lock(&cache->space_info->lock);
@@ -2623,27 +2623,22 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
 	spin_unlock(&cache->space_info->lock);
 	btrfs_put_block_group(cache);
+
+	return 0;
 }
 
 static int clean_log_buffer(struct btrfs_trans_handle *trans,
 			    struct extent_buffer *eb)
 {
-	int ret;
-
 	btrfs_tree_lock(eb);
 	btrfs_clear_buffer_dirty(trans, eb);
 	wait_on_extent_buffer_writeback(eb);
 	btrfs_tree_unlock(eb);
 
-	if (trans) {
-		ret = btrfs_pin_reserved_extent(trans, eb);
-		if (ret)
-			return ret;
-	} else {
-		unaccount_log_buffer(eb->fs_info, eb->start);
-	}
+	if (trans)
+		return btrfs_pin_reserved_extent(trans, eb);
 
-	return 0;
+	return unaccount_log_buffer(eb->fs_info, eb->start);
 }
 
 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 245e813ecd78..ea662036f441 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
 #include "accessors.h"
 #include "bio.h"
 #include "transaction.h"
+#include "sysfs.h"
 
 /* Maximum number of zones to report per blkdev_report_zones() call */
 #define BTRFS_REPORT_NR_ZONES   4096
@@ -42,6 +43,9 @@
 /* Number of superblock log zones */
 #define BTRFS_NR_SB_LOG_ZONES 2
 
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES	128
+
 /*
  * Minimum of active zones we need:
 *
@@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 	if (!IS_ALIGNED(nr_sectors, zone_sectors))
 		zone_info->nr_zones++;
 
-	max_active_zones = bdev_max_active_zones(bdev);
+	max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+					bdev_max_open_zones(bdev));
+	if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+		max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
 	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
 		btrfs_err(fs_info,
 "zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -2168,10 +2175,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 		goto out_unlock;
 	}
 
-	/* No space left */
-	if (btrfs_zoned_bg_is_full(block_group)) {
-		ret = false;
-		goto out_unlock;
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+		/* The caller should check if the block group is full. */
+		if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+			ret = false;
+			goto out_unlock;
+		}
+	} else {
+		/* Since it is already written, it should have been active. */
+		WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
 	}
 
 	for (i = 0; i < map->num_stripes; i++) {
@@ -2230,7 +2242,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	const u64 end = block_group->start + block_group->length;
 	struct extent_buffer *eb;
-	unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
+	unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
 
 	rcu_read_lock();
 	xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
@@ -2245,6 +2257,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 	rcu_read_unlock();
 }
 
+static int call_zone_finish(struct btrfs_block_group *block_group,
+			    struct btrfs_io_stripe *stripe)
+{
+	struct btrfs_device *device = stripe->dev;
+	const u64 physical = stripe->physical;
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	int ret;
+
+	if (!device->bdev)
+		return 0;
+
+	if (zinfo->max_active_zones == 0)
+		return 0;
+
+	if (btrfs_dev_is_sequential(device, physical)) {
+		unsigned int nofs_flags;
+
+		nofs_flags = memalloc_nofs_save();
+		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+				       physical >> SECTOR_SHIFT,
+				       zinfo->zone_size >> SECTOR_SHIFT);
+		memalloc_nofs_restore(nofs_flags);
+
+		if (ret)
+			return ret;
+	}
+
+	if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+		zinfo->reserved_active_zones++;
+	btrfs_dev_clear_active_zone(device, physical);
+
+	return 0;
+}
+
 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2329,31 +2375,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 	down_read(&dev_replace->rwsem);
 	map = block_group->physical_map;
 	for (i = 0; i < map->num_stripes; i++) {
-		struct btrfs_device *device = map->stripes[i].dev;
-		const u64 physical = map->stripes[i].physical;
-		struct btrfs_zoned_device_info *zinfo = device->zone_info;
-		unsigned int nofs_flags;
-
-		if (!device->bdev)
-			continue;
-
-		if (zinfo->max_active_zones == 0)
-			continue;
-
-		nofs_flags = memalloc_nofs_save();
-		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
-				       physical >> SECTOR_SHIFT,
-				       zinfo->zone_size >> SECTOR_SHIFT);
-		memalloc_nofs_restore(nofs_flags);
-
+		ret = call_zone_finish(block_group, &map->stripes[i]);
 		if (ret) {
 			up_read(&dev_replace->rwsem);
 			return ret;
 		}
-
-		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
-			zinfo->reserved_active_zones++;
-		btrfs_dev_clear_active_zone(device, physical);
 	}
 	up_read(&dev_replace->rwsem);
 
@@ -2504,12 +2531,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
 void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
-	struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
+	struct btrfs_space_info *space_info = data_sinfo;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_block_group *bg;
 	struct list_head *bg_list;
 	u64 alloc_flags;
-	bool initial = false;
+	bool first = true;
 	bool did_chunk_alloc = false;
 	int index;
 	int ret;
@@ -2523,21 +2550,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
 	if (sb_rdonly(fs_info->sb))
 		return;
 
-	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
 	alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
 	index = btrfs_bg_flags_to_raid_index(alloc_flags);
 
-	bg_list = &data_sinfo->block_groups[index];
+	/* Scan the data space_info to find empty block groups. Take the second one. */
 again:
+	bg_list = &space_info->block_groups[index];
 	list_for_each_entry(bg, bg_list, list) {
-		if (bg->used > 0)
+		if (bg->alloc_offset != 0)
 			continue;
 
-		if (!initial) {
-			initial = true;
+		if (first) {
+			first = false;
 			continue;
 		}
 
+		if (space_info == data_sinfo) {
+			/* Migrate the block group to the data relocation space_info. */
+			struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+			int factor;
+
+			ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+			factor = btrfs_bg_type_to_factor(bg->flags);
+
+			down_write(&space_info->groups_sem);
+			list_del_init(&bg->list);
+			/* We can assume this as we choose the second empty one. */
+			ASSERT(!list_empty(&space_info->block_groups[index]));
+			up_write(&space_info->groups_sem);
+
+			spin_lock(&space_info->lock);
+			space_info->total_bytes -= bg->length;
+			space_info->disk_total -= bg->length * factor;
+			/* There is no allocation ever happened. */
+			ASSERT(bg->used == 0);
+			ASSERT(bg->zone_unusable == 0);
+			/* No super block in a block group on the zoned setup. */
+			ASSERT(bg->bytes_super == 0);
+			spin_unlock(&space_info->lock);
+
+			bg->space_info = reloc_sinfo;
+			if (reloc_sinfo->block_group_kobjs[index] == NULL)
+				btrfs_sysfs_add_block_group_type(bg);
+
+			btrfs_add_bg_to_space_info(fs_info, bg);
+		}
+
 		fs_info->data_reloc_bg = bg->start;
 		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
 		btrfs_zone_activate(bg);
@@ -2552,11 +2610,18 @@ again:
 	if (IS_ERR(trans))
 		return;
 
+	/* Allocate new BG in the data relocation space_info. */
+	space_info = data_sinfo->sub_group[0];
+	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
 	ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
 	btrfs_end_transaction(trans);
 	if (ret == 1) {
+		/*
+		 * We allocated a new block group in the data relocation space_info. We
+		 * can take that one.
+		 */
+		first = false;
 		did_chunk_alloc = true;
-		bg_list = &space_info->block_groups[index];
 		goto again;
 	}
 }
@@ -2650,7 +2715,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
 		spin_lock(&block_group->lock);
 		if (block_group->reserved || block_group->alloc_offset == 0 ||
-		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+		    !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
 		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
 			spin_unlock(&block_group->lock);
 			continue;