diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
| -rw-r--r-- | fs/btrfs/volumes.c | 242 | 
1 files changed, 140 insertions, 102 deletions
| diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3d0ac8bdb21f..0a0776489055 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,8 +13,8 @@  #include <linux/list_sort.h>  #include <linux/namei.h>  #include "misc.h" -#include "ctree.h"  #include "disk-io.h" +#include "extent-tree.h"  #include "transaction.h"  #include "volumes.h"  #include "raid56.h" @@ -48,6 +48,7 @@ struct btrfs_io_geometry {  	u64 raid56_full_stripe_start;  	int max_errors;  	enum btrfs_map_op op; +	bool use_rst;  };  const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -1302,6 +1303,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,  	struct btrfs_device *device;  	struct btrfs_device *latest_dev = NULL;  	struct btrfs_device *tmp_device; +	s64 __maybe_unused value = 0;  	int ret = 0;  	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1331,7 +1333,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,  	fs_devices->latest_dev = latest_dev;  	fs_devices->total_rw_bytes = 0;  	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; +	fs_devices->read_devid = latest_dev->devid; +	fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), +							    &value); +	if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) +		fs_devices->collect_fs_stats = true; + +	if (value) { +		if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) +			fs_devices->rr_min_contig_read = value; +		if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) +			fs_devices->read_devid = value; +	} +#else  	fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#endif  	return 0;  } @@ -2049,7 +2067,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,  	ptr = btrfs_device_fsid(dev_item);  	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,  			    ptr, BTRFS_FSID_SIZE); -	btrfs_mark_buffer_dirty(trans, leaf);  	ret = 0;  out: @@ -2745,11 +2762,9 @@ next_slot:  		device = btrfs_find_device(fs_info->fs_devices, &args);  		BUG_ON(!device); /* Logic error */ -		if (device->fs_devices->seeding) { +		if (device->fs_devices->seeding)  			btrfs_set_device_generation(leaf, dev_item,  						    device->generation); -			btrfs_mark_buffer_dirty(trans, leaf); -		}  		path->slots[0]++;  		goto next_slot; @@ -3042,8 +3057,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,  				     btrfs_device_get_disk_total_bytes(device));  	btrfs_set_device_bytes_used(leaf, dev_item,  				    btrfs_device_get_bytes_used(device)); -	btrfs_mark_buffer_dirty(trans, leaf); -  out:  	btrfs_free_path(path);  	return ret; @@ -3752,10 +3765,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,  	btrfs_set_balance_meta(leaf, item, &disk_bargs);  	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);  	btrfs_set_balance_sys(leaf, item, &disk_bargs); -  	btrfs_set_balance_flags(leaf, item, bctl->flags); - -	btrfs_mark_buffer_dirty(trans, leaf);  out:  	btrfs_free_path(path);  	err = btrfs_commit_transaction(trans); @@ -5517,33 +5527,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma  	btrfs_free_chunk_map(map);  } +static int btrfs_chunk_map_cmp(const struct rb_node *new, +			       const struct rb_node *exist) +{ +	const struct btrfs_chunk_map *new_map = +		rb_entry(new, struct btrfs_chunk_map, rb_node); +	const struct btrfs_chunk_map *exist_map = +		rb_entry(exist, struct btrfs_chunk_map, rb_node); + +	if (new_map->start == exist_map->start) +		return 0; +	if (new_map->start < exist_map->start) +		return -1; +	return 1; +} +  EXPORT_FOR_TESTS  int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)  { -	struct rb_node **p; -	struct rb_node *parent = NULL; -	bool leftmost = true; +	struct rb_node *exist;  	write_lock(&fs_info->mapping_tree_lock); -	p = &fs_info->mapping_tree.rb_root.rb_node; -	while (*p) { -		struct btrfs_chunk_map *entry; - -		parent = *p; -		entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); - -		if (map->start < entry->start) { -			p = &(*p)->rb_left; -		} else if (map->start > entry->start) { -			p = &(*p)->rb_right; -			leftmost = false; -		} else { -			write_unlock(&fs_info->mapping_tree_lock); -			return -EEXIST; -		} +	exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, +				   btrfs_chunk_map_cmp); + +	if (exist) { +		write_unlock(&fs_info->mapping_tree_lock); +		return -EEXIST;  	} -	rb_link_node(&map->rb_node, parent, p); -	rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);  	chunk_map_device_set_bits(map, CHUNK_ALLOCATED);  	chunk_map_device_clear_bits(map, CHUNK_TRIMMED);  	write_unlock(&fs_info->mapping_tree_lock); @@ -5963,6 +5974,76 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,  	return len;  } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) +{ +	for (int index = first; index < first + num_stripes; index++) { +		const struct btrfs_device *device = map->stripes[index].dev; + +		if (device->devid == READ_ONCE(device->fs_devices->read_devid)) +			return index; +	} + +	/* If no read-preferred device is set use the first stripe. */ +	return first; +} + +struct stripe_mirror { +	u64 devid; +	int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ +	const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; +	const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + +	if (s1->devid < s2->devid) +		return -1; +	if (s1->devid > s2->devid) +		return 1; +	return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + *  1. Compute the read cycle as the total sectors read divided by the minimum + *     sectors per device. + *  2. Determine the stripe number for the current read by taking the modulus + *     of the read cycle with the total number of stripes: + * + *      stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. + */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ +	struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; +	struct btrfs_device *device  = map->stripes[first].dev; +	struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; +	unsigned int read_cycle; +	unsigned int total_reads; +	unsigned int min_reads_per_dev; + +	total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); +	min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> +						       fs_info->sectorsize_bits; + +	for (int index = 0, i = first; i < first + num_stripes; i++) { +		stripes[index].devid = map->stripes[i].dev->devid; +		stripes[index].num = i; +		index++; +	} +	sort(stripes, num_stripes, sizeof(struct stripe_mirror), +	     btrfs_cmp_devid, NULL); + +	read_cycle = total_reads / min_reads_per_dev; +	return stripes[read_cycle % num_stripes].num; +} +#endif +  static int find_live_mirror(struct btrfs_fs_info *fs_info,  			    struct btrfs_chunk_map *map, int first,  			    int dev_replace_is_ongoing) @@ -5992,6 +6073,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,  	case BTRFS_READ_POLICY_PID:  		preferred_mirror = first + (current->pid % num_stripes);  		break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL +	case BTRFS_READ_POLICY_RR: +		preferred_mirror = btrfs_read_rr(map, first, num_stripes); +		break; +	case BTRFS_READ_POLICY_DEVID: +		preferred_mirror = btrfs_read_preferred(map, first, num_stripes); +		break; +#endif  	}  	if (dev_replace_is_ongoing && @@ -6350,8 +6439,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,  {  	dst->dev = map->stripes[io_geom->stripe_index].dev; -	if (io_geom->op == BTRFS_MAP_READ && -	    btrfs_need_stripe_tree_update(fs_info, map->type)) +	if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)  		return btrfs_get_raid_extent_offset(fs_info, logical, length,  						    map->type,  						    io_geom->stripe_index, dst); @@ -6366,7 +6454,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,  				const struct btrfs_io_stripe *smap,  				const struct btrfs_chunk_map *map,  				int num_alloc_stripes, -				enum btrfs_map_op op, int mirror_num) +				struct btrfs_io_geometry *io_geom)  {  	if (!smap)  		return false; @@ -6374,10 +6462,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,  	if (num_alloc_stripes != 1)  		return false; -	if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) +	if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)  		return false; -	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) +	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)  		return false;  	return true; @@ -6583,6 +6671,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  	io_geom.raid56_full_stripe_start = (u64)-1;  	max_len = btrfs_max_io_len(map, map_offset, &io_geom);  	*length = min_t(u64, map->chunk_len - map_offset, max_len); +	io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);  	if (dev_replace->replace_task != current)  		down_read(&dev_replace->rwsem); @@ -6651,8 +6740,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  	 * physical block information on the stack instead of allocating an  	 * I/O context structure.  	 */ -	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, -				io_geom.mirror_num)) { +	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {  		ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);  		if (mirror_num_ret)  			*mirror_num_ret = io_geom.mirror_num; @@ -6666,6 +6754,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  		goto out;  	}  	bioc->map_type = map->type; +	bioc->use_rst = io_geom.use_rst;  	/*  	 * For RAID56 full map, we need to make sure the stripes[] follows the @@ -7006,16 +7095,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,  	warn_32bit_meta_chunk(fs_info, logical, length, type);  #endif -	/* -	 * Only need to verify chunk item if we're reading from sys chunk array, -	 * as chunk item in tree block is already verified by tree-checker. -	 */ -	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { -		ret = btrfs_check_chunk_valid(leaf, chunk, logical); -		if (ret) -			return ret; -	} -  	map = btrfs_find_chunk_map(fs_info, logical, 1);  	/* already mapped? */ @@ -7273,16 +7352,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  {  	struct btrfs_super_block *super_copy = fs_info->super_copy;  	struct extent_buffer *sb; -	struct btrfs_disk_key *disk_key; -	struct btrfs_chunk *chunk;  	u8 *array_ptr;  	unsigned long sb_array_offset;  	int ret = 0; -	u32 num_stripes;  	u32 array_size; -	u32 len = 0;  	u32 cur_offset; -	u64 type;  	struct btrfs_key key;  	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); @@ -7305,10 +7379,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  	cur_offset = 0;  	while (cur_offset < array_size) { -		disk_key = (struct btrfs_disk_key *)array_ptr; -		len = sizeof(*disk_key); -		if (cur_offset + len > array_size) -			goto out_short_read; +		struct btrfs_chunk *chunk; +		struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; +		u32 len = sizeof(*disk_key); + +		/* +		 * The sys_chunk_array has been already verified at super block +		 * read time.  Only do ASSERT()s for basic checks. +		 */ +		ASSERT(cur_offset + len <= array_size);  		btrfs_disk_key_to_cpu(&key, disk_key); @@ -7316,44 +7395,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  		sb_array_offset += len;  		cur_offset += len; -		if (key.type != BTRFS_CHUNK_ITEM_KEY) { -			btrfs_err(fs_info, -			    "unexpected item type %u in sys_array at offset %u", -				  (u32)key.type, cur_offset); -			ret = -EIO; -			break; -		} +		ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);  		chunk = (struct btrfs_chunk *)sb_array_offset; -		/* -		 * At least one btrfs_chunk with one stripe must be present, -		 * exact stripe count check comes afterwards -		 */ -		len = btrfs_chunk_item_size(1); -		if (cur_offset + len > array_size) -			goto out_short_read; - -		num_stripes = btrfs_chunk_num_stripes(sb, chunk); -		if (!num_stripes) { -			btrfs_err(fs_info, -			"invalid number of stripes %u in sys_array at offset %u", -				  num_stripes, cur_offset); -			ret = -EIO; -			break; -		} +		ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); -		type = btrfs_chunk_type(sb, chunk); -		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { -			btrfs_err(fs_info, -			"invalid chunk type %llu in sys_array at offset %u", -				  type, cur_offset); -			ret = -EIO; -			break; -		} +		len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); -		len = btrfs_chunk_item_size(num_stripes); -		if (cur_offset + len > array_size) -			goto out_short_read; +		ASSERT(cur_offset + len <= array_size);  		ret = read_one_chunk(&key, sb, chunk);  		if (ret) @@ -7366,13 +7415,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)  	clear_extent_buffer_uptodate(sb);  	free_extent_buffer_stale(sb);  	return ret; - -out_short_read: -	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", -			len, cur_offset); -	clear_extent_buffer_uptodate(sb); -	free_extent_buffer_stale(sb); -	return -EIO;  }  /* @@ -7572,8 +7614,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)  	struct btrfs_device *device;  	int ret = 0; -	fs_devices->fs_info = fs_info; -  	mutex_lock(&fs_devices->device_list_mutex);  	list_for_each_entry(device, &fs_devices->devices, dev_list)  		device->fs_info = fs_info; @@ -7749,8 +7789,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,  	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)  		btrfs_set_dev_stats_value(eb, ptr, i,  					  btrfs_dev_stat_read(device, i)); -	btrfs_mark_buffer_dirty(trans, eb); -  out:  	btrfs_free_path(path);  	return ret; | 
