Diffstat (limited to 'fs/btrfs')
46 files changed, 3719 insertions, 1320 deletions
| diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index f341a98031d2..6d1d0b93b1aa 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o  btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \  	tests/extent-buffer-tests.o tests/btrfs-tests.o \ -	tests/extent-io-tests.o tests/inode-tests.o +	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index ff9b3995d453..9a0124a95851 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,  	const char *name;  	char *value = NULL; -	if (acl) { -		ret = posix_acl_valid(acl); -		if (ret < 0) -			return ret; -		ret = 0; -	} -  	switch (type) {  	case ACL_TYPE_ACCESS:  		name = POSIX_ACL_XATTR_ACCESS; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 10db21fa0926..e25564bfcb46 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -900,7 +900,11 @@ again:  		goto out;  	BUG_ON(ret == 0); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (trans && likely(trans->type != __TRANS_DUMMY)) { +#else  	if (trans) { +#endif  		/*  		 * look if there are updates for this ref queued and lock the  		 * head @@ -984,11 +988,12 @@ again:  				goto out;  		}  		if (ref->count && ref->parent) { -			if (extent_item_pos && !ref->inode_list) { +			if (extent_item_pos && !ref->inode_list && +			    ref->level == 0) {  				u32 bsz;  				struct extent_buffer *eb;  				bsz = btrfs_level_size(fs_info->extent_root, -							info_level); +							ref->level);  				eb = read_tree_block(fs_info->extent_root,  							   ref->parent, bsz, 0);  				if (!eb || !extent_buffer_uptodate(eb)) { @@ -1404,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,   * returns <0 on error   */  static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, -				struct btrfs_extent_item *ei, u32 item_size, -				struct btrfs_extent_inline_ref **out_eiref, -				int *out_type) +				   struct btrfs_key *key, +				   struct btrfs_extent_item *ei, u32 item_size, +				   struct btrfs_extent_inline_ref **out_eiref, +				   int *out_type)  {  	unsigned long end;  	u64 flags; @@ -1416,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,  		/* first call */  		flags = btrfs_extent_flags(eb, ei);  		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { -			info = (struct btrfs_tree_block_info *)(ei + 1); -			*out_eiref = -				(struct btrfs_extent_inline_ref *)(info + 1); +			if (key->type == BTRFS_METADATA_ITEM_KEY) { +				/* a skinny metadata extent */ +				*out_eiref = +				     (struct btrfs_extent_inline_ref *)(ei + 1); +			} else { +				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); +				info = (struct btrfs_tree_block_info *)(ei + 1); +				*out_eiref = +				   (struct btrfs_extent_inline_ref *)(info + 1); +			}  		} else {  			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);  		}  		*ptr = (unsigned long)*out_eiref; -		if ((void *)*ptr >= (void *)ei + item_size) +		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)  			return -ENOENT;  	}  	end = (unsigned long)ei + item_size; -	*out_eiref = (struct btrfs_extent_inline_ref *)*ptr; +	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);  	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);  	*ptr += btrfs_extent_inline_ref_size(*out_type); @@ -1447,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, 
struct extent_buffer *eb,   * <0 on error.   */  int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, -				struct btrfs_extent_item *ei, u32 item_size, -				u64 *out_root, u8 *out_level) +			    struct btrfs_key *key, struct btrfs_extent_item *ei, +			    u32 item_size, u64 *out_root, u8 *out_level)  {  	int ret;  	int type; @@ -1459,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,  		return 1;  	while (1) { -		ret = __get_extent_inline_ref(ptr, eb, ei, item_size, -						&eiref, &type); +		ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, +					      &eiref, &type);  		if (ret < 0)  			return ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a910b27a8ad9..86fc20fec282 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,  			u64 *flags);  int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, -				struct btrfs_extent_item *ei, u32 item_size, -				u64 *out_root, u8 *out_level); +			    struct btrfs_key *key, struct btrfs_extent_item *ei, +			    u32 item_size, u64 *out_root, u8 *out_level);  int iterate_extent_inodes(struct btrfs_fs_info *fs_info,  				u64 extent_item_objectid, @@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,  int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);  int btrfs_find_all_roots(struct btrfs_trans_handle *trans, -				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 time_seq, struct ulist **roots); +			 struct btrfs_fs_info *fs_info, u64 bytenr, +			 u64 time_seq, struct ulist **roots);  char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,  			u32 name_len, unsigned long name_off,  			struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c9a24444ec9a..4794923c410c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)  static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)  { -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,  		  &BTRFS_I(inode)->runtime_flags);  } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); +  #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0e8388e72d8d..ce92ae30250f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error:  					next_stack =  					    btrfsic_stack_frame_alloc();  					if (NULL == next_stack) { +						sf->error = -1;  						btrfsic_release_block_ctx(  								&sf->  								next_block_ctx); @@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame:  				    sf->next_block_ctx.datav[0];  				next_stack = btrfsic_stack_frame_alloc(); -				if (NULL == next_stack) +				if (NULL == next_stack) { +					sf->error = -1;  					goto one_stack_frame_backwards; +				}  				next_stack->i = -1;  				next_stack->block = sf->next_block; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d43c544d3b68..92371c414228 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,  	workspace = find_workspace(type);  	if (IS_ERR(workspace)) -		return -1; +		return PTR_ERR(workspace);  	ret = 
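
Editor's note: the btrfs_inode.h hunk above switches from smp_mb__before_clear_bit() to smp_mb__before_atomic(), the kernel's renamed barrier for ordering plain stores before a following atomic bit operation. As a rough, hedged userspace analogue (not the kernel API; names here are illustrative), the same ordering can be expressed with C11 atomics:

        /* Illustrative userspace analogue of "full barrier, then clear a flag bit".
         * atomic_thread_fence() stands in for the smp_mb__before_atomic() +
         * clear_bit() pairing shown in the hunk above. */
        #include <stdatomic.h>
        #include <stdio.h>

        static _Atomic unsigned long runtime_flags;
        static int dio_count;                 /* ordinary data published before the flag flips */

        #define READDIO_NEED_LOCK (1UL << 0)

        static void resume_unlocked_dio(void)
        {
                dio_count = 0;                                  /* plain store */
                atomic_thread_fence(memory_order_seq_cst);      /* "smp_mb__before_atomic()" */
                atomic_fetch_and(&runtime_flags, ~READDIO_NEED_LOCK);  /* "clear_bit()" */
        }

        int main(void)
        {
                atomic_fetch_or(&runtime_flags, READDIO_NEED_LOCK);
                dio_count = 42;
                resume_unlocked_dio();
                printf("flags=%#lx dio_count=%d\n",
                       atomic_load(&runtime_flags), dio_count);
                return 0;
        }
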
btrfs_compress_op[type-1]->compress_pages(workspace, mapping,  						      start, len, pages, @@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,  	workspace = find_workspace(type);  	if (IS_ERR(workspace)) -		return -ENOMEM; +		return PTR_ERR(workspace);  	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,  							 disk_start, @@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,  	workspace = find_workspace(type);  	if (IS_ERR(workspace)) -		return -ENOMEM; +		return PTR_ERR(workspace);  	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,  						  dest_page, start_byte, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1bcfcdb23cf4..aeab453b8e24 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)  static void add_root_to_dirty_list(struct btrfs_root *root)  {  	spin_lock(&root->fs_info->trans_lock); -	if (root->track_dirty && list_empty(&root->dirty_list)) { +	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && +	    list_empty(&root->dirty_list)) {  		list_add(&root->dirty_list,  			 &root->fs_info->dirty_cowonly_roots);  	} @@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	int level;  	struct btrfs_disk_key disk_key; -	WARN_ON(root->ref_cows && trans->transid != -		root->fs_info->running_transaction->transid); -	WARN_ON(root->ref_cows && trans->transid != root->last_trans); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->fs_info->running_transaction->transid); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->last_trans);  	level = btrfs_header_level(buf);  	if (level == 0) @@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)  }  /* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. - */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ -	u64 seq = atomic64_read(&fs_info->tree_mod_seq); -	seq &= 0xffffffff00000000ull; -	seq += 1ull << 32; -	atomic64_set(&fs_info->tree_mod_seq, seq); -	return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. + * Pull a new tree mod seq number for our operation.   */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)  {  	return atomic64_inc_return(&fs_info->tree_mod_seq);  }  /* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ -	return (seq & 0xffffffff00000000ull) - 1ull; -} - -/*   * This adds a new blocker to the tree mod log's blocker list if the @elem   * passed does not already have a sequence number set. 
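
Editor's note: the compression.c hunks above stop hard-coding -1/-ENOMEM and instead propagate PTR_ERR(workspace). A minimal sketch of the ERR_PTR/PTR_ERR convention those callers rely on (simplified userspace definitions; the real macros live in include/linux/err.h):

        /* Sketch: encode a small negative errno in an invalid pointer and
         * recover it later, so the original error is propagated instead of a
         * guessed -ENOMEM. */
        #include <errno.h>
        #include <stdio.h>

        #define MAX_ERRNO 4095

        static inline void *ERR_PTR(long error) { return (void *)error; }
        static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
        static inline int IS_ERR(const void *ptr)
        {
                return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
        }

        static void *find_workspace_demo(int fail)
        {
                static int ws;

                if (fail)
                        return ERR_PTR(-EINTR);    /* whatever really went wrong */
                return &ws;
        }

        static int compress_demo(int fail)
        {
                void *workspace = find_workspace_demo(fail);

                if (IS_ERR(workspace))
                        return PTR_ERR(workspace); /* propagate, don't guess */
                return 0;
        }

        int main(void)
        {
                printf("ok=%d err=%d\n", compress_demo(0), compress_demo(1));
                return 0;
        }
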
So when a caller expects   * to record tree modifications, it should ensure to set elem->seq to zero @@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)  u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,  			   struct seq_list *elem)  { -	u64 seq; -  	tree_mod_log_write_lock(fs_info);  	spin_lock(&fs_info->tree_mod_seq_lock);  	if (!elem->seq) { -		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); +		elem->seq = btrfs_inc_tree_mod_seq(fs_info);  		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);  	} -	seq = btrfs_inc_tree_mod_seq_minor(fs_info);  	spin_unlock(&fs_info->tree_mod_seq_lock);  	tree_mod_log_write_unlock(fs_info); -	return seq; +	return elem->seq;  }  void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)  	BUG_ON(!tm); -	spin_lock(&fs_info->tree_mod_seq_lock); -	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); -	spin_unlock(&fs_info->tree_mod_seq_lock); +	tm->seq = btrfs_inc_tree_mod_seq(fs_info);  	tm_root = &fs_info->tree_mod_log;  	new = &tm_root->rb_node; @@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,  	 * snapshot and the block was not allocated by tree relocation,  	 * we know the block is not shared.  	 */ -	if (root->ref_cows && +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&  	    buf != root->node && buf != root->commit_root &&  	    (btrfs_header_generation(buf) <=  	     btrfs_root_last_snapshot(&root->root_item) ||  	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))  		return 1;  #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -	if (root->ref_cows && +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&  	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)  		return 1;  #endif @@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	btrfs_assert_tree_locked(buf); -	WARN_ON(root->ref_cows && trans->transid != -		root->fs_info->running_transaction->transid); -	WARN_ON(root->ref_cows && trans->transid != root->last_trans); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->fs_info->running_transaction->transid); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->last_trans);  	level = btrfs_header_level(buf); @@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  		return ret;  	} -	if (root->ref_cows) { +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {  		ret = btrfs_reloc_cow_block(trans, root, buf, cow);  		if (ret)  			return ret; @@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct extent_buffer *buf)  { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	/* ensure we can see the force_cow */  	smp_rmb(); @@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,  	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&  	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&  	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && -	    !root->force_cow) +	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))  		return 0;  	return 1;  } @@ -5125,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)  		return ret;  	btrfs_item_key(path->nodes[0], &found_key, 0);  	ret = comp_keys(&found_key, &key); -	if (ret < 0) +	/* +	 * We 
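
Editor's note: the ctree.c hunks above drop the split major/minor tree_mod_seq scheme in favour of a single counter bumped with atomic64_inc_return(). A hedged userspace sketch of the same idea with C11 atomics (illustrative only, not the kernel helpers):

        /* Sketch: one monotonically increasing 64-bit sequence number replaces
         * the old "major.minor" split; every caller just takes the next value. */
        #include <stdatomic.h>
        #include <stdint.h>
        #include <stdio.h>

        static _Atomic uint64_t tree_mod_seq;

        static uint64_t inc_tree_mod_seq(void)
        {
                /* atomic64_inc_return() analogue: add 1, return the new value */
                return atomic_fetch_add(&tree_mod_seq, 1) + 1;
        }

        int main(void)
        {
                struct { uint64_t seq; } elem = { 0 };

                if (!elem.seq)                 /* first blocker registration */
                        elem.seq = inc_tree_mod_seq();

                printf("elem.seq=%llu next=%llu\n",
                       (unsigned long long)elem.seq,
                       (unsigned long long)inc_tree_mod_seq());
                return 0;
        }
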
might have had an item with the previous key in the tree right +	 * before we released our path. And after we released our path, that +	 * item might have been pushed to the first slot (0) of the leaf we +	 * were holding due to a tree balance. Alternatively, an item with the +	 * previous key can exist as the only element of a leaf (big fat item). +	 * Therefore account for these 2 cases, so that our callers (like +	 * btrfs_previous_item) don't miss an existing item with a key matching +	 * the previous key we computed above. +	 */ +	if (ret <= 0)  		return 0;  	return 1;  } @@ -5736,6 +5718,24 @@ again:  		ret = 0;  		goto done;  	} +	/* +	 * So the above check misses one case: +	 * - after releasing the path above, someone has removed the item that +	 *   used to be at the very end of the block, and balance between leafs +	 *   gets another one with bigger key.offset to replace it. +	 * +	 * This one should be returned as well, or we can get leaf corruption +	 * later(esp. in __btrfs_drop_extents()). +	 * +	 * And a bit more explanation about this check, +	 * with ret > 0, the key isn't found, the path points to the slot +	 * where it should be inserted, so the path->slots[0] item must be the +	 * bigger one. +	 */ +	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { +		ret = 0; +		goto done; +	}  	while (level < BTRFS_MAX_LEVEL) {  		if (!path->nodes[level]) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ba6b88528dc7..b7e2c1c1ef36 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@  #include <asm/kmap_types.h>  #include <linux/pagemap.h>  #include <linux/btrfs.h> +#include <linux/workqueue.h>  #include "extent_io.h"  #include "extent_map.h"  #include "async-thread.h" @@ -756,6 +757,12 @@ struct btrfs_dir_item {  #define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0) +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD		(1ULL << 48) +  struct btrfs_root_item {  	struct btrfs_inode_item inode;  	__le64 generation; @@ -840,7 +847,10 @@ struct btrfs_disk_balance_args {  	/* BTRFS_BALANCE_ARGS_* */  	__le64 flags; -	__le64 unused[8]; +	/* BTRFS_BALANCE_ARGS_LIMIT value */ +	__le64 limit; + +	__le64 unused[7];  } __attribute__ ((__packed__));  /* @@ -1113,6 +1123,12 @@ struct btrfs_qgroup_limit_item {  	__le64 rsv_excl;  } __attribute__ ((__packed__)); +/* For raid type sysfs entries */ +struct raid_kobject { +	int raid_type; +	struct kobject kobj; +}; +  struct btrfs_space_info {  	spinlock_t lock; @@ -1163,7 +1179,7 @@ struct btrfs_space_info {  	wait_queue_head_t wait;  	struct kobject kobj; -	struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES]; +	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];  };  #define	BTRFS_BLOCK_RSV_GLOBAL		1 @@ -1313,6 +1329,8 @@ struct btrfs_stripe_hash_table {  #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +void btrfs_init_async_reclaim_work(struct work_struct *work); +  /* fs_info */  struct reloc_control;  struct btrfs_device; @@ -1534,6 +1552,9 @@ struct btrfs_fs_info {  	 */  	struct btrfs_workqueue *fixup_workers;  	struct btrfs_workqueue *delayed_workers; + +	/* the extent workers do delayed refs on the extent allocation tree */ +	struct btrfs_workqueue *extent_workers;  	struct task_struct *transaction_kthread;  	struct task_struct *cleaner_kthread;  	int thread_pool_size; @@ -1636,7 +1657,10 @@ struct btrfs_fs_info {  	/* holds configuration and tracking. 
Protected by qgroup_lock */  	struct rb_root qgroup_tree; +	struct rb_root qgroup_op_tree;  	spinlock_t qgroup_lock; +	spinlock_t qgroup_op_lock; +	atomic_t qgroup_op_seq;  	/*  	 * used to avoid frequently calling ulist_alloc()/ulist_free() @@ -1688,6 +1712,9 @@ struct btrfs_fs_info {  	struct semaphore uuid_tree_rescan_sem;  	unsigned int update_uuid_tree_gen:1; + +	/* Used to reclaim the metadata space in the background. */ +	struct work_struct async_reclaim_work;  };  struct btrfs_subvolume_writers { @@ -1696,6 +1723,26 @@ struct btrfs_subvolume_writers {  };  /* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code.   But the + * race is very small, and only the first time the root + * is added to each transaction.  So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP	0 +#define BTRFS_ROOT_REF_COWS		1 +#define BTRFS_ROOT_TRACK_DIRTY		2 +#define BTRFS_ROOT_IN_RADIX		3 +#define BTRFS_ROOT_DUMMY_ROOT		4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED	5 +#define BTRFS_ROOT_DEFRAG_RUNNING	6 +#define BTRFS_ROOT_FORCE_COW		7 +#define BTRFS_ROOT_MULTI_LOG_TASKS	8 + +/*   * in ram representation of the tree.  extent_root is used for all allocations   * and for the extent tree extent_root root.   */ @@ -1706,6 +1753,7 @@ struct btrfs_root {  	struct btrfs_root *log_root;  	struct btrfs_root *reloc_root; +	unsigned long state;  	struct btrfs_root_item root_item;  	struct btrfs_key root_key;  	struct btrfs_fs_info *fs_info; @@ -1740,7 +1788,6 @@ struct btrfs_root {  	/* Just be updated when the commit succeeds. */  	int last_log_commit;  	pid_t log_start_pid; -	bool log_multiple_pids;  	u64 objectid;  	u64 last_trans; @@ -1760,23 +1807,13 @@ struct btrfs_root {  	u64 highest_objectid; -	/* btrfs_record_root_in_trans is a multi-step process, -	 * and it can race with the balancing code.   But the -	 * race is very small, and only the first time the root -	 * is added to each transaction.  
So in_trans_setup -	 * is used to tell us when more checks are required -	 */ -	unsigned long in_trans_setup; -	int ref_cows; -	int track_dirty; -	int in_radix;  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -	int dummy_root; +	u64 alloc_bytenr;  #endif +  	u64 defrag_trans_start;  	struct btrfs_key defrag_progress;  	struct btrfs_key defrag_max; -	int defrag_running;  	char *name;  	/* the dirty list is only used by non-reference counted roots */ @@ -1790,7 +1827,6 @@ struct btrfs_root {  	spinlock_t orphan_lock;  	atomic_t orphan_inodes;  	struct btrfs_block_rsv *orphan_block_rsv; -	int orphan_item_inserted;  	int orphan_cleanup_state;  	spinlock_t inode_lock; @@ -1808,8 +1844,6 @@ struct btrfs_root {  	 */  	dev_t anon_dev; -	int force_cow; -  	spinlock_t root_item_lock;  	atomic_t refs; @@ -2788,6 +2822,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)  	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;  } +static inline bool btrfs_root_dead(struct btrfs_root *root) +{ +	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} +  /* struct btrfs_root_backup */  BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,  		   tree_root, 64); @@ -2897,6 +2936,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,  	cpu->vend = le64_to_cpu(disk->vend);  	cpu->target = le64_to_cpu(disk->target);  	cpu->flags = le64_to_cpu(disk->flags); +	cpu->limit = le64_to_cpu(disk->limit);  }  static inline void @@ -2914,6 +2954,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,  	disk->vend = cpu_to_le64(cpu->vend);  	disk->target = cpu_to_le64(cpu->target);  	disk->flags = cpu_to_le64(cpu->flags); +	disk->limit = cpu_to_le64(cpu->limit);  }  /* struct btrfs_super_block */ @@ -3236,6 +3277,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_root *root, +				 unsigned long count, int wait);  int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root, u64 bytenr, @@ -3275,9 +3318,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,  			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,  			 struct btrfs_key *ins, int is_data);  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref, int for_cow); +		  struct extent_buffer *buf, int full_backref, int no_quota);  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref, int for_cow); +		  struct extent_buffer *buf, int full_backref, int no_quota);  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  				struct btrfs_root *root,  				u64 bytenr, u64 num_bytes, u64 flags, @@ -3285,7 +3328,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  int btrfs_free_extent(struct btrfs_trans_handle *trans,  		      struct btrfs_root *root,  		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, -		      u64 owner, u64 offset, int for_cow); +		      u64 owner, u64 offset, int no_quota);  int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);  int 
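
Editor's note: the ctree.h changes above replace a handful of per-root int fields (ref_cows, track_dirty, in_radix, ...) with numbered bits in a single unsigned long root->state word manipulated with set_bit()/test_bit(). A compact userspace sketch of that pattern (plain bit operations stand in for the kernel's atomic bitops; names are illustrative):

        /* Sketch: several boolean fields collapsed into one flags word. */
        #include <stdio.h>

        enum {
                ROOT_IN_TRANS_SETUP,
                ROOT_REF_COWS,
                ROOT_TRACK_DIRTY,
                ROOT_IN_RADIX,
        };

        struct demo_root {
                unsigned long state;
        };

        static void set_state(struct demo_root *r, int bit)  { r->state |= 1UL << bit; }
        static int  test_state(struct demo_root *r, int bit) { return !!(r->state & (1UL << bit)); }

        int main(void)
        {
                struct demo_root root = { .state = 0 };

                set_state(&root, ROOT_REF_COWS);
                set_state(&root, ROOT_TRACK_DIRTY);

                printf("ref_cows=%d track_dirty=%d in_radix=%d\n",
                       test_state(&root, ROOT_REF_COWS),
                       test_state(&root, ROOT_TRACK_DIRTY),
                       test_state(&root, ROOT_IN_RADIX));
                return 0;
        }
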
btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -3297,7 +3340,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  			 struct btrfs_root *root,  			 u64 bytenr, u64 num_bytes, u64 parent, -			 u64 root_objectid, u64 owner, u64 offset, int for_cow); +			 u64 root_objectid, u64 owner, u64 offset, int no_quota);  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,  				    struct btrfs_root *root); @@ -3385,7 +3428,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,  					 struct btrfs_fs_info *fs_info);  int __get_raid_index(u64 flags); -  int btrfs_start_nocow_write(struct btrfs_root *root);  void btrfs_end_nocow_write(struct btrfs_root *root);  /* ctree.c */ @@ -3561,7 +3603,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,  			   struct seq_list *elem);  void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,  			    struct seq_list *elem); -u64 btrfs_tree_mod_seq_prev(u64 seq);  int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);  /* root-item.c */ @@ -3708,6 +3749,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,  		       struct bio *bio, u64 file_start, int contig);  int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,  			     struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, +				     const struct btrfs_path *path, +				     struct btrfs_file_extent_item *fi, +				     const bool new_inline, +				     struct extent_map *em); +  /* inode.c */  struct btrfs_delalloc_work {  	struct inode *inode; @@ -4069,52 +4116,6 @@ void btrfs_reada_detach(void *handle);  int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,  			 u64 start, int err); -/* qgroup.c */ -struct qgroup_update { -	struct list_head list; -	struct btrfs_delayed_ref_node *node; -	struct btrfs_delayed_extent_op *extent_op; -}; - -int btrfs_quota_enable(struct btrfs_trans_handle *trans, -		       struct btrfs_fs_info *fs_info); -int btrfs_quota_disable(struct btrfs_trans_handle *trans, -			struct btrfs_fs_info *fs_info); -int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); -void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, -			      struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, -			      struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_create_qgroup(struct btrfs_trans_handle *trans, -			struct btrfs_fs_info *fs_info, u64 qgroupid, -			char *name); -int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, -			      struct btrfs_fs_info *fs_info, u64 qgroupid); -int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, -		       struct btrfs_fs_info *fs_info, u64 qgroupid, -		       struct btrfs_qgroup_limit *limit); -int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); -void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, -			    struct btrfs_delayed_ref_node *node, -			    struct btrfs_delayed_extent_op *extent_op); -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, -			     struct btrfs_fs_info *fs_info, -			     struct 
btrfs_delayed_ref_node *node, -			     struct btrfs_delayed_extent_op *extent_op); -int btrfs_run_qgroups(struct btrfs_trans_handle *trans, -		      struct btrfs_fs_info *fs_info); -int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, -			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, -			 struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); -  static inline int is_fstree(u64 rootid)  {  	if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -4131,6 +4132,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)  /* Sanity test specific functions */  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  void btrfs_test_destroy_inode(struct inode *inode); +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, +			       u64 rfer, u64 excl);  #endif  #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 33e561a84013..da775bfdebc9 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -149,8 +149,8 @@ again:  	spin_lock(&root->inode_lock);  	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);  	if (ret == -EEXIST) { -		kmem_cache_free(delayed_node_cache, node);  		spin_unlock(&root->inode_lock); +		kmem_cache_free(delayed_node_cache, node);  		radix_tree_preload_end();  		goto again;  	} @@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node(  	mutex_unlock(&delayed_node->mutex);  	if (atomic_dec_and_test(&delayed_node->refs)) { +		bool free = false;  		struct btrfs_root *root = delayed_node->root;  		spin_lock(&root->inode_lock);  		if (atomic_read(&delayed_node->refs) == 0) {  			radix_tree_delete(&root->delayed_nodes_tree,  					  delayed_node->inode_id); -			kmem_cache_free(delayed_node_cache, delayed_node); +			free = true;  		}  		spin_unlock(&root->inode_lock); +		if (free) +			kmem_cache_free(delayed_node_cache, delayed_node);  	}  } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 31299646024d..6d16bea94e1c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,  		return -1;  	if (ref1->type > ref2->type)  		return 1; +	if (ref1->no_quota > ref2->no_quota) +		return 1; +	if (ref1->no_quota < ref2->no_quota) +		return -1;  	/* merging of sequenced refs is not allowed */  	if (compare_seq) {  		if (ref1->seq < ref2->seq) @@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  		     struct btrfs_delayed_ref_head *head_ref,  		     struct btrfs_delayed_ref_node *ref, u64 bytenr,  		     u64 num_bytes, u64 parent, u64 ref_root, int level, -		     int action, int for_cow) +		     int action, int no_quota)  {  	struct btrfs_delayed_ref_node *existing;  	struct btrfs_delayed_tree_ref *full_ref; @@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	if (action == BTRFS_ADD_DELAYED_EXTENT)  		action = BTRFS_ADD_DELAYED_REF; +	if (is_fstree(ref_root)) +		seq = atomic64_read(&fs_info->tree_mod_seq);  	delayed_refs = &trans->transaction->delayed_refs;  	/* first set the basic ref node struct up */ @@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	ref->action = action;  	ref->is_head = 0;  	ref->in_tree = 1; - -	if (need_ref_seq(for_cow, ref_root)) -		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); +	ref->no_quota = no_quota;  	ref->seq = 
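
Editor's note: the delayed-inode.c hunks above move kmem_cache_free() out from under root->inode_lock, only recording under the lock that the node may be freed. A hedged userspace sketch of the "decide under the lock, free after unlocking" pattern, using a pthread mutex in place of the spinlock:

        /* Sketch: never call the allocator's free path while holding the lock;
         * record the decision under the lock and act on it afterwards. */
        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct node { int refs; };

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

        static void release_node(struct node *n)
        {
                bool free_it = false;

                pthread_mutex_lock(&lock);
                if (--n->refs == 0) {
                        /* unlink from whatever structure owns it here */
                        free_it = true;
                }
                pthread_mutex_unlock(&lock);

                if (free_it)            /* free() runs with the lock dropped */
                        free(n);
        }

        int main(void)
        {
                struct node *n = malloc(sizeof(*n));

                n->refs = 1;
                release_node(n);
                puts("node released outside the lock");
                return 0;
        }
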
seq;  	full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,  		     struct btrfs_delayed_ref_head *head_ref,  		     struct btrfs_delayed_ref_node *ref, u64 bytenr,  		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner, -		     u64 offset, int action, int for_cow) +		     u64 offset, int action, int no_quota)  {  	struct btrfs_delayed_ref_node *existing;  	struct btrfs_delayed_data_ref *full_ref; @@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	delayed_refs = &trans->transaction->delayed_refs; +	if (is_fstree(ref_root)) +		seq = atomic64_read(&fs_info->tree_mod_seq); +  	/* first set the basic ref node struct up */  	atomic_set(&ref->refs, 1);  	ref->bytenr = bytenr; @@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	ref->action = action;  	ref->is_head = 0;  	ref->in_tree = 1; - -	if (need_ref_seq(for_cow, ref_root)) -		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); +	ref->no_quota = no_quota;  	ref->seq = seq;  	full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  			       u64 bytenr, u64 num_bytes, u64 parent,  			       u64 ref_root,  int level, int action,  			       struct btrfs_delayed_extent_op *extent_op, -			       int for_cow) +			       int no_quota)  {  	struct btrfs_delayed_tree_ref *ref;  	struct btrfs_delayed_ref_head *head_ref;  	struct btrfs_delayed_ref_root *delayed_refs; +	if (!is_fstree(ref_root) || !fs_info->quota_enabled) +		no_quota = 0; +  	BUG_ON(extent_op && extent_op->is_data);  	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);  	if (!ref) @@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,  				   num_bytes, parent, ref_root, level, action, -				   for_cow); +				   no_quota);  	spin_unlock(&delayed_refs->lock); -	if (need_ref_seq(for_cow, ref_root)) -		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);  	return 0;  } @@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,  			       u64 parent, u64 ref_root,  			       u64 owner, u64 offset, int action,  			       struct btrfs_delayed_extent_op *extent_op, -			       int for_cow) +			       int no_quota)  {  	struct btrfs_delayed_data_ref *ref;  	struct btrfs_delayed_ref_head *head_ref;  	struct btrfs_delayed_ref_root *delayed_refs; +	if (!is_fstree(ref_root) || !fs_info->quota_enabled) +		no_quota = 0; +  	BUG_ON(extent_op && !extent_op->is_data);  	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);  	if (!ref) @@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,  	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,  				   num_bytes, parent, ref_root, owner, offset, -				   action, for_cow); +				   action, no_quota);  	spin_unlock(&delayed_refs->lock); -	if (need_ref_seq(for_cow, ref_root)) -		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);  	return 0;  } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 4ba9b93022ff..a764e2340d48 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {  	unsigned int action:8;  	unsigned int type:8; +	unsigned int no_quota:1;  	/* is this node still in the rbtree? 
*/  	unsigned int is_head:1;  	unsigned int in_tree:1; @@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,  			       u64 bytenr, u64 num_bytes, u64 parent,  			       u64 ref_root, int level, int action,  			       struct btrfs_delayed_extent_op *extent_op, -			       int for_cow); +			       int no_quota);  int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,  			       struct btrfs_trans_handle *trans,  			       u64 bytenr, u64 num_bytes,  			       u64 parent, u64 ref_root,  			       u64 owner, u64 offset, int action,  			       struct btrfs_delayed_extent_op *extent_op, -			       int for_cow); +			       int no_quota);  int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,  				struct btrfs_trans_handle *trans,  				u64 bytenr, u64 num_bytes, @@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,  			    u64 seq);  /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ -	if (for_cow) -		return 0; - -	if (rootid == BTRFS_FS_TREE_OBJECTID) -		return 1; - -	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) -		return 1; - -	return 0; -} - -/*   * a node might live in a head or a regular ref, this lets you   * test for the proper type to use.   */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9f2290509aca..2af6e66fe788 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -313,7 +313,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,  	if (btrfs_fs_incompat(fs_info, RAID56)) {  		btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); -		return -EINVAL; +		return -EOPNOTSUPP;  	}  	switch (args->start.cont_reading_from_srcdev_mode) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 983314932af3..8bb4aa19898f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@  #include "dev-replace.h"  #include "raid56.h"  #include "sysfs.h" +#include "qgroup.h"  #ifdef CONFIG_X86  #include <asm/cpufeature.h> @@ -1109,6 +1110,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,  						 u64 bytenr, u32 blocksize)  { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return alloc_test_extent_buffer(root->fs_info, bytenr, +						blocksize); +#endif  	return alloc_extent_buffer(root->fs_info, bytenr, blocksize);  } @@ -1201,10 +1207,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	root->nodesize = nodesize;  	root->leafsize = leafsize;  	root->stripesize = stripesize; -	root->ref_cows = 0; -	root->track_dirty = 0; -	root->in_radix = 0; -	root->orphan_item_inserted = 0; +	root->state = 0;  	root->orphan_cleanup_state = 0;  	root->objectid = objectid; @@ -1265,7 +1268,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	else  		root->defrag_trans_start = 0;  	init_completion(&root->kobj_unregister); -	root->defrag_running = 0;  	root->root_key.objectid = objectid;  	root->anon_dev = 0; @@ -1290,7 +1292,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)  	if (!root)  		return ERR_PTR(-ENOMEM);  	__setup_root(4096, 4096, 4096, 4096, root, NULL, 1); -	root->dummy_root = 1; +	set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); +	
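
Editor's note: the delayed-ref.c/h hunks above add a 1-bit no_quota flag to struct btrfs_delayed_ref_node and include it in comp_entry(), so refs that differ only in quota treatment are not merged. A small sketch of ordering on a bitfield inside a three-way comparator (field names illustrative, not the kernel struct):

        /* Sketch: a 1-bit flag packed next to other bitfields, and a comparator
         * that keeps entries differing only in that flag distinct. */
        #include <stdio.h>

        struct ref {
                unsigned long long bytenr;
                unsigned int action:8;
                unsigned int type:8;
                unsigned int no_quota:1;
        };

        static int comp_ref(const struct ref *a, const struct ref *b)
        {
                if (a->bytenr < b->bytenr) return -1;
                if (a->bytenr > b->bytenr) return 1;
                if (a->type < b->type) return -1;
                if (a->type > b->type) return 1;
                if (a->no_quota < b->no_quota) return -1;
                if (a->no_quota > b->no_quota) return 1;
                return 0;       /* only now are the two refs considered mergeable */
        }

        int main(void)
        {
                struct ref a = { .bytenr = 4096, .type = 1, .no_quota = 0 };
                struct ref b = { .bytenr = 4096, .type = 1, .no_quota = 1 };

                printf("comp=%d\n", comp_ref(&a, &b));
                return 0;
        }
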
root->alloc_bytenr = 0;  	return root;  } @@ -1341,8 +1344,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	root->commit_root = btrfs_root_node(root); -	root->track_dirty = 1; - +	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);  	root->root_item.flags = 0;  	root->root_item.byte_limit = 0; @@ -1371,6 +1373,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  fail:  	if (leaf) {  		btrfs_tree_unlock(leaf); +		free_extent_buffer(root->commit_root);  		free_extent_buffer(leaf);  	}  	kfree(root); @@ -1396,13 +1399,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,  	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;  	root->root_key.type = BTRFS_ROOT_ITEM_KEY;  	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; +  	/* +	 * DON'T set REF_COWS for log trees +	 *  	 * log trees do not get reference counted because they go away  	 * before a real commit is actually done.  They do store pointers  	 * to file data extents, and those reference counts still get  	 * updated (along with back refs to the log tree).  	 */ -	root->ref_cows = 0;  	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,  				      BTRFS_TREE_LOG_OBJECTID, NULL, @@ -1536,7 +1541,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,  		return root;  	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { -		root->ref_cows = 1; +		set_bit(BTRFS_ROOT_REF_COWS, &root->state);  		btrfs_check_and_init_root_item(&root->root_item);  	} @@ -1606,7 +1611,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,  				(unsigned long)root->root_key.objectid,  				root);  	if (ret == 0) -		root->in_radix = 1; +		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);  	spin_unlock(&fs_info->fs_roots_radix_lock);  	radix_tree_preload_end(); @@ -1662,7 +1667,7 @@ again:  	if (ret < 0)  		goto fail;  	if (ret == 0) -		root->orphan_item_inserted = 1; +		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);  	ret = btrfs_insert_fs_root(fs_info, root);  	if (ret) { @@ -2064,6 +2069,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)  	btrfs_destroy_workqueue(fs_info->readahead_workers);  	btrfs_destroy_workqueue(fs_info->flush_workers);  	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); +	btrfs_destroy_workqueue(fs_info->extent_workers);  }  static void free_root_extent_buffers(struct btrfs_root *root) @@ -2090,7 +2096,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)  		free_root_extent_buffers(info->chunk_root);  } -static void del_fs_roots(struct btrfs_fs_info *fs_info) +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)  {  	int ret;  	struct btrfs_root *gang[8]; @@ -2101,7 +2107,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)  				     struct btrfs_root, root_list);  		list_del(&gang[0]->root_list); -		if (gang[0]->in_radix) { +		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {  			btrfs_drop_and_free_fs_root(fs_info, gang[0]);  		} else {  			free_extent_buffer(gang[0]->node); @@ -2221,6 +2227,7 @@ int open_ctree(struct super_block *sb,  	spin_lock_init(&fs_info->free_chunk_lock);  	spin_lock_init(&fs_info->tree_mod_seq_lock);  	spin_lock_init(&fs_info->super_lock); +	spin_lock_init(&fs_info->qgroup_op_lock);  	spin_lock_init(&fs_info->buffer_lock);  	rwlock_init(&fs_info->tree_mod_log_lock);  	mutex_init(&fs_info->reloc_mutex); @@ -2246,6 +2253,7 @@ int open_ctree(struct super_block *sb,  	
atomic_set(&fs_info->async_submit_draining, 0);  	atomic_set(&fs_info->nr_async_bios, 0);  	atomic_set(&fs_info->defrag_running, 0); +	atomic_set(&fs_info->qgroup_op_seq, 0);  	atomic64_set(&fs_info->tree_mod_seq, 0);  	fs_info->sb = sb;  	fs_info->max_inline = 8192 * 1024; @@ -2291,6 +2299,7 @@ int open_ctree(struct super_block *sb,  	atomic_set(&fs_info->balance_cancel_req, 0);  	fs_info->balance_ctl = NULL;  	init_waitqueue_head(&fs_info->balance_wait_q); +	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);  	sb->s_blocksize = 4096;  	sb->s_blocksize_bits = blksize_bits(4096); @@ -2354,6 +2363,7 @@ int open_ctree(struct super_block *sb,  	spin_lock_init(&fs_info->qgroup_lock);  	mutex_init(&fs_info->qgroup_ioctl_lock);  	fs_info->qgroup_tree = RB_ROOT; +	fs_info->qgroup_op_tree = RB_ROOT;  	INIT_LIST_HEAD(&fs_info->dirty_qgroups);  	fs_info->qgroup_seq = 1;  	fs_info->quota_enabled = 0; @@ -2577,6 +2587,10 @@ int open_ctree(struct super_block *sb,  		btrfs_alloc_workqueue("readahead", flags, max_active, 2);  	fs_info->qgroup_rescan_workers =  		btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); +	fs_info->extent_workers = +		btrfs_alloc_workqueue("extent-refs", flags, +				      min_t(u64, fs_devices->num_devices, +					    max_active), 8);  	if (!(fs_info->workers && fs_info->delalloc_workers &&  	      fs_info->submit_workers && fs_info->flush_workers && @@ -2586,6 +2600,7 @@ int open_ctree(struct super_block *sb,  	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&  	      fs_info->caching_workers && fs_info->readahead_workers &&  	      fs_info->fixup_workers && fs_info->delayed_workers && +	      fs_info->fixup_workers && fs_info->extent_workers &&  	      fs_info->qgroup_rescan_workers)) {  		err = -ENOMEM;  		goto fail_sb_buffer; @@ -2693,7 +2708,7 @@ retry_root_backup:  		ret = PTR_ERR(extent_root);  		goto recovery_tree_root;  	} -	extent_root->track_dirty = 1; +	set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);  	fs_info->extent_root = extent_root;  	location.objectid = BTRFS_DEV_TREE_OBJECTID; @@ -2702,7 +2717,7 @@ retry_root_backup:  		ret = PTR_ERR(dev_root);  		goto recovery_tree_root;  	} -	dev_root->track_dirty = 1; +	set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);  	fs_info->dev_root = dev_root;  	btrfs_init_devices_late(fs_info); @@ -2712,13 +2727,13 @@ retry_root_backup:  		ret = PTR_ERR(csum_root);  		goto recovery_tree_root;  	} -	csum_root->track_dirty = 1; +	set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);  	fs_info->csum_root = csum_root;  	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;  	quota_root = btrfs_read_tree_root(tree_root, &location);  	if (!IS_ERR(quota_root)) { -		quota_root->track_dirty = 1; +		set_bit(BTRFS_ROOT_TRACK_DIRTY, "a_root->state);  		fs_info->quota_enabled = 1;  		fs_info->pending_quota_state = 1;  		fs_info->quota_root = quota_root; @@ -2733,7 +2748,7 @@ retry_root_backup:  		create_uuid_tree = true;  		check_uuid_tree = false;  	} else { -		uuid_root->track_dirty = 1; +		set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);  		fs_info->uuid_root = uuid_root;  		create_uuid_tree = false;  		check_uuid_tree = @@ -2966,7 +2981,7 @@ fail_qgroup:  fail_trans_kthread:  	kthread_stop(fs_info->transaction_kthread);  	btrfs_cleanup_transaction(fs_info->tree_root); -	del_fs_roots(fs_info); +	btrfs_free_fs_roots(fs_info);  fail_cleaner:  	kthread_stop(fs_info->cleaner_kthread); @@ -3501,8 +3516,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,  	if (test_bit(BTRFS_FS_STATE_ERROR, 
&fs_info->fs_state))  		btrfs_free_log(NULL, root); -	__btrfs_remove_free_space_cache(root->free_ino_pinned); -	__btrfs_remove_free_space_cache(root->free_ino_ctl); +	if (root->free_ino_pinned) +		__btrfs_remove_free_space_cache(root->free_ino_pinned); +	if (root->free_ino_ctl) +		__btrfs_remove_free_space_cache(root->free_ino_ctl);  	free_fs_root(root);  } @@ -3533,28 +3550,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)  {  	u64 root_objectid = 0;  	struct btrfs_root *gang[8]; -	int i; -	int ret; +	int i = 0; +	int err = 0; +	unsigned int ret = 0; +	int index;  	while (1) { +		index = srcu_read_lock(&fs_info->subvol_srcu);  		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,  					     (void **)gang, root_objectid,  					     ARRAY_SIZE(gang)); -		if (!ret) +		if (!ret) { +			srcu_read_unlock(&fs_info->subvol_srcu, index);  			break; - +		}  		root_objectid = gang[ret - 1]->root_key.objectid + 1; +  		for (i = 0; i < ret; i++) { -			int err; +			/* Avoid to grab roots in dead_roots */ +			if (btrfs_root_refs(&gang[i]->root_item) == 0) { +				gang[i] = NULL; +				continue; +			} +			/* grab all the search result for later use */ +			gang[i] = btrfs_grab_fs_root(gang[i]); +		} +		srcu_read_unlock(&fs_info->subvol_srcu, index); +		for (i = 0; i < ret; i++) { +			if (!gang[i]) +				continue;  			root_objectid = gang[i]->root_key.objectid;  			err = btrfs_orphan_cleanup(gang[i]);  			if (err) -				return err; +				break; +			btrfs_put_fs_root(gang[i]);  		}  		root_objectid++;  	} -	return 0; + +	/* release the uncleaned roots due to error */ +	for (; i < ret; i++) { +		if (gang[i]) +			btrfs_put_fs_root(gang[i]); +	} +	return err;  }  int btrfs_commit_super(struct btrfs_root *root) @@ -3603,6 +3643,8 @@ int close_ctree(struct btrfs_root *root)  	/* clear out the rbtree of defraggable inodes */  	btrfs_cleanup_defrag_inodes(fs_info); +	cancel_work_sync(&fs_info->async_reclaim_work); +  	if (!(fs_info->sb->s_flags & MS_RDONLY)) {  		ret = btrfs_commit_super(root);  		if (ret) @@ -3627,12 +3669,17 @@ int close_ctree(struct btrfs_root *root)  	btrfs_sysfs_remove_one(fs_info); -	del_fs_roots(fs_info); +	btrfs_free_fs_roots(fs_info);  	btrfs_put_block_group_cache(fs_info);  	btrfs_free_block_groups(fs_info); +	/* +	 * we must make sure there is not any read request to +	 * submit after we stopping all workers. 
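
Editor's note: the btrfs_cleanup_fs_roots() rework above pins every root found during the SRCU-protected gang lookup, drops the SRCU read lock, and only then runs orphan cleanup (which may sleep), putting the references afterwards and releasing any leftovers on error. A simplified, hedged userspace sketch of that "pin under the lookup lock, process outside it" pattern (locking and refcounting reduced to the bare minimum):

        /* Sketch: take a reference while the lookup lock is held, do the slow
         * work after dropping it, and always drop leftover references on error. */
        #include <pthread.h>
        #include <stdio.h>

        struct root { const char *name; int refs; };

        static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;
        static struct root roots[] = { { "fs_tree", 1 }, { "subvol_257", 1 } };

        static int cleanup_one(struct root *r)
        {
                printf("cleaning %s\n", r->name);       /* may sleep in real code */
                return 0;
        }

        int main(void)
        {
                struct root *gang[8];
                int i, n = 0, err = 0;

                pthread_mutex_lock(&lookup_lock);
                for (i = 0; i < 2; i++) {
                        roots[i].refs++;                /* "grab": pin each result */
                        gang[n++] = &roots[i];
                }
                pthread_mutex_unlock(&lookup_lock);

                for (i = 0; i < n; i++) {
                        err = cleanup_one(gang[i]);
                        gang[i]->refs--;                /* "put" after processing */
                        if (err)
                                break;
                }
                for (i++; i < n; i++)                   /* drop refs we never used */
                        gang[i]->refs--;

                return err;
        }
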
+	 */ +	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);  	btrfs_stop_all_workers(fs_info);  	free_root_pointers(fs_info, 1); @@ -3709,6 +3756,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)  		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,  				     buf->len,  				     root->fs_info->dirty_metadata_batch); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +	if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { +		btrfs_print_leaf(root, buf); +		ASSERT(0); +	} +#endif  }  static void __btrfs_btree_balance_dirty(struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 53059df350f8..23ce3ceba0a9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,  int btrfs_init_fs_root(struct btrfs_root *root);  int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,  			 struct btrfs_root *root); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);  struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,  				     struct btrfs_key *key, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5590af92094b..fafb3e53ecde 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,16 +26,16 @@  #include <linux/ratelimit.h>  #include <linux/percpu_counter.h>  #include "hash.h" -#include "ctree.h" +#include "tree-log.h"  #include "disk-io.h"  #include "print-tree.h" -#include "transaction.h"  #include "volumes.h"  #include "raid56.h"  #include "locking.h"  #include "free-space-cache.h"  #include "math.h"  #include "sysfs.h" +#include "qgroup.h"  #undef SCRAMBLE_DELAYED_REFS @@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  				u64 bytenr, u64 num_bytes, u64 parent,  				u64 root_objectid, u64 owner_objectid,  				u64 owner_offset, int refs_to_drop, -				struct btrfs_delayed_extent_op *extra_op); +				struct btrfs_delayed_extent_op *extra_op, +				int no_quota);  static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,  				    struct extent_buffer *leaf,  				    struct btrfs_extent_item *ei); @@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root,  				     u64 parent, u64 root_objectid,  				     u64 flags, struct btrfs_disk_key *key, -				     int level, struct btrfs_key *ins); +				     int level, struct btrfs_key *ins, +				     int no_quota);  static int do_chunk_alloc(struct btrfs_trans_handle *trans,  			  struct btrfs_root *extent_root, u64 flags,  			  int force); @@ -1271,7 +1273,7 @@ fail:  static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,  					   struct btrfs_root *root,  					   struct btrfs_path *path, -					   int refs_to_drop) +					   int refs_to_drop, int *last_ref)  {  	struct btrfs_key key;  	struct btrfs_extent_data_ref *ref1 = NULL; @@ -1307,6 +1309,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,  	if (num_refs == 0) {  		ret = btrfs_del_item(trans, root, path); +		*last_ref = 1;  	} else {  		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)  			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1764,7 +1767,8 @@ void update_inline_extent_backref(struct btrfs_root *root,  				  struct btrfs_path *path,  				  struct btrfs_extent_inline_ref *iref,  				  int refs_to_mod, -				  struct btrfs_delayed_extent_op *extent_op) +				  struct btrfs_delayed_extent_op *extent_op, +				  int *last_ref)  {  	struct extent_buffer 
*leaf;  	struct btrfs_extent_item *ei; @@ -1808,6 +1812,7 @@ void update_inline_extent_backref(struct btrfs_root *root,  		else  			btrfs_set_shared_data_ref_count(leaf, sref, refs);  	} else { +		*last_ref = 1;  		size =  btrfs_extent_inline_ref_size(type);  		item_size = btrfs_item_size_nr(leaf, path->slots[0]);  		ptr = (unsigned long)iref; @@ -1839,7 +1844,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,  	if (ret == 0) {  		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);  		update_inline_extent_backref(root, path, iref, -					     refs_to_add, extent_op); +					     refs_to_add, extent_op, NULL);  	} else if (ret == -ENOENT) {  		setup_inline_extent_backref(root, path, iref, parent,  					    root_objectid, owner, offset, @@ -1872,17 +1877,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,  				 struct btrfs_root *root,  				 struct btrfs_path *path,  				 struct btrfs_extent_inline_ref *iref, -				 int refs_to_drop, int is_data) +				 int refs_to_drop, int is_data, int *last_ref)  {  	int ret = 0;  	BUG_ON(!is_data && refs_to_drop != 1);  	if (iref) {  		update_inline_extent_backref(root, path, iref, -					     -refs_to_drop, NULL); +					     -refs_to_drop, NULL, last_ref);  	} else if (is_data) { -		ret = remove_extent_data_ref(trans, root, path, refs_to_drop); +		ret = remove_extent_data_ref(trans, root, path, refs_to_drop, +					     last_ref);  	} else { +		*last_ref = 1;  		ret = btrfs_del_item(trans, root, path);  	}  	return ret; @@ -1946,7 +1953,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  			 struct btrfs_root *root,  			 u64 bytenr, u64 num_bytes, u64 parent, -			 u64 root_objectid, u64 owner, u64 offset, int for_cow) +			 u64 root_objectid, u64 owner, u64 offset, +			 int no_quota)  {  	int ret;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -1958,12 +1966,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,  					num_bytes,  					parent, root_objectid, (int)owner, -					BTRFS_ADD_DELAYED_REF, NULL, for_cow); +					BTRFS_ADD_DELAYED_REF, NULL, no_quota);  	} else {  		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,  					num_bytes,  					parent, root_objectid, owner, offset, -					BTRFS_ADD_DELAYED_REF, NULL, for_cow); +					BTRFS_ADD_DELAYED_REF, NULL, no_quota);  	}  	return ret;  } @@ -1973,31 +1981,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  				  u64 bytenr, u64 num_bytes,  				  u64 parent, u64 root_objectid,  				  u64 owner, u64 offset, int refs_to_add, +				  int no_quota,  				  struct btrfs_delayed_extent_op *extent_op)  { +	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct btrfs_extent_item *item; +	struct btrfs_key key;  	u64 refs;  	int ret; +	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; +	if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) +		no_quota = 1; +  	path->reada = 1;  	path->leave_spinning = 1;  	/* this will setup the path even if it fails to insert the back ref */ -	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, -					   path, bytenr, num_bytes, parent, +	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, +					   bytenr, num_bytes, parent,  					   root_objectid, owner, offset,  					   refs_to_add, 
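
Editor's note: the extent-tree.c hunks above thread an int *last_ref out-parameter through remove_extent_data_ref()/update_inline_extent_backref()/remove_extent_backref() so callers learn whether dropping references deleted the backref item itself. A tiny hedged sketch of that out-parameter convention (names illustrative):

        /* Sketch: report via an optional out-parameter whether dropping refs
         * removed the item entirely. */
        #include <stdio.h>

        static int drop_refs(unsigned int *refs, unsigned int to_drop, int *last_ref)
        {
                if (to_drop > *refs)
                        return -1;
                *refs -= to_drop;
                if (*refs == 0 && last_ref)
                        *last_ref = 1;          /* caller may now free the item */
                return 0;
        }

        int main(void)
        {
                unsigned int refs = 2;
                int last_ref = 0;

                drop_refs(&refs, 1, NULL);      /* callers that don't care pass NULL */
                drop_refs(&refs, 1, &last_ref);
                printf("refs=%u last_ref=%d\n", refs, last_ref);
                return 0;
        }
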
extent_op); -	if (ret != -EAGAIN) +	if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))  		goto out; +	/* +	 * Ok we were able to insert an inline extent and it appears to be a new +	 * reference, deal with the qgroup accounting. +	 */ +	if (!ret && !no_quota) { +		ASSERT(root->fs_info->quota_enabled); +		leaf = path->nodes[0]; +		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +		item = btrfs_item_ptr(leaf, path->slots[0], +				      struct btrfs_extent_item); +		if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) +			type = BTRFS_QGROUP_OPER_ADD_SHARED; +		btrfs_release_path(path); +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      bytenr, num_bytes, type, 0); +		goto out; +	} + +	/* +	 * Ok we had -EAGAIN which means we didn't have space to insert and +	 * inline extent ref, so just update the reference count and add a +	 * normal backref. +	 */  	leaf = path->nodes[0]; +	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);  	refs = btrfs_extent_refs(leaf, item); +	if (refs) +		type = BTRFS_QGROUP_OPER_ADD_SHARED;  	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);  	if (extent_op)  		__run_delayed_extent_op(extent_op, leaf, item); @@ -2005,9 +2046,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_release_path(path); +	if (!no_quota) { +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      bytenr, num_bytes, type, 0); +		if (ret) +			goto out; +	} +  	path->reada = 1;  	path->leave_spinning = 1; -  	/* now insert the actual backref */  	ret = insert_extent_backref(trans, root->fs_info->extent_root,  				    path, bytenr, parent, root_objectid, @@ -2041,8 +2088,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,  	if (node->type == BTRFS_SHARED_DATA_REF_KEY)  		parent = ref->parent; -	else -		ref_root = ref->root; +	ref_root = ref->root;  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {  		if (extent_op) @@ -2056,13 +2102,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,  					     node->num_bytes, parent,  					     ref_root, ref->objectid,  					     ref->offset, node->ref_mod, -					     extent_op); +					     node->no_quota, extent_op);  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {  		ret = __btrfs_free_extent(trans, root, node->bytenr,  					  node->num_bytes, parent,  					  ref_root, ref->objectid,  					  ref->offset, node->ref_mod, -					  extent_op); +					  extent_op, node->no_quota);  	} else {  		BUG();  	} @@ -2199,8 +2245,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,  	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)  		parent = ref->parent; -	else -		ref_root = ref->root; +	ref_root = ref->root;  	ins.objectid = node->bytenr;  	if (skinny_metadata) { @@ -2218,15 +2263,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,  						parent, ref_root,  						extent_op->flags_to_set,  						&extent_op->key, -						ref->level, &ins); +						ref->level, &ins, +						node->no_quota);  	} else if (node->action == BTRFS_ADD_DELAYED_REF) {  		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,  					     node->num_bytes, parent, ref_root, -					     ref->level, 0, 1, extent_op); +					     ref->level, 0, 1, node->no_quota, +					     extent_op);  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {  		ret = __btrfs_free_extent(trans, root, node->bytenr,  					  node->num_bytes, 
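
Editor's note: in the __btrfs_inc_extent_ref() rework above, the qgroup operation type is chosen from the reference count: a backref added to an extent with no prior refs is recorded as an exclusive add, otherwise as a shared add. A toy sketch of that decision (enum and helper names are illustrative, not the qgroup API):

        /* Sketch: classify a reference add for quota accounting depending on
         * whether other references to the extent already exist. */
        #include <stdio.h>

        enum qgroup_op { QGROUP_ADD_EXCL, QGROUP_ADD_SHARED };

        static enum qgroup_op classify_add(unsigned long long refs_before_add)
        {
                /* mirrors "refs > refs_to_add ? shared : exclusive" for refs_to_add == 1 */
                return refs_before_add ? QGROUP_ADD_SHARED : QGROUP_ADD_EXCL;
        }

        int main(void)
        {
                printf("new extent -> %s\n",
                       classify_add(0) == QGROUP_ADD_EXCL ? "exclusive" : "shared");
                printf("reflinked  -> %s\n",
                       classify_add(1) == QGROUP_ADD_EXCL ? "exclusive" : "shared");
                return 0;
        }
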
parent, ref_root, -					  ref->level, 0, 1, extent_op); +					  ref->level, 0, 1, extent_op, +					  node->no_quota);  	} else {  		BUG();  	} @@ -2574,42 +2622,6 @@ static u64 find_middle(struct rb_root *root)  }  #endif -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, -					 struct btrfs_fs_info *fs_info) -{ -	struct qgroup_update *qgroup_update; -	int ret = 0; - -	if (list_empty(&trans->qgroup_ref_list) != -	    !trans->delayed_ref_elem.seq) { -		/* list without seq or seq without list */ -		btrfs_err(fs_info, -			"qgroup accounting update error, list is%s empty, seq is %#x.%x", -			list_empty(&trans->qgroup_ref_list) ? "" : " not", -			(u32)(trans->delayed_ref_elem.seq >> 32), -			(u32)trans->delayed_ref_elem.seq); -		BUG(); -	} - -	if (!trans->delayed_ref_elem.seq) -		return 0; - -	while (!list_empty(&trans->qgroup_ref_list)) { -		qgroup_update = list_first_entry(&trans->qgroup_ref_list, -						 struct qgroup_update, list); -		list_del(&qgroup_update->list); -		if (!ret) -			ret = btrfs_qgroup_account_ref( -					trans, fs_info, qgroup_update->node, -					qgroup_update->extent_op); -		kfree(qgroup_update); -	} - -	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); - -	return ret; -} -  static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)  {  	u64 num_bytes; @@ -2662,15 +2674,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,  	u64 num_entries =  		atomic_read(&trans->transaction->delayed_refs.num_entries);  	u64 avg_runtime; +	u64 val;  	smp_mb();  	avg_runtime = fs_info->avg_delayed_ref_runtime; +	val = num_entries * avg_runtime;  	if (num_entries * avg_runtime >= NSEC_PER_SEC)  		return 1; +	if (val >= NSEC_PER_SEC / 2) +		return 2;  	return btrfs_check_space_for_delayed_refs(trans, root);  } +struct async_delayed_refs { +	struct btrfs_root *root; +	int count; +	int error; +	int sync; +	struct completion wait; +	struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ +	struct async_delayed_refs *async; +	struct btrfs_trans_handle *trans; +	int ret; + +	async = container_of(work, struct async_delayed_refs, work); + +	trans = btrfs_join_transaction(async->root); +	if (IS_ERR(trans)) { +		async->error = PTR_ERR(trans); +		goto done; +	} + +	/* +	 * trans->sync means that when we call end_transaciton, we won't +	 * wait on delayed refs +	 */ +	trans->sync = true; +	ret = btrfs_run_delayed_refs(trans, async->root, async->count); +	if (ret) +		async->error = ret; + +	ret = btrfs_end_transaction(trans, async->root); +	if (ret && !async->error) +		async->error = ret; +done: +	if (async->sync) +		complete(&async->wait); +	else +		kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, +				 unsigned long count, int wait) +{ +	struct async_delayed_refs *async; +	int ret; + +	async = kmalloc(sizeof(*async), GFP_NOFS); +	if (!async) +		return -ENOMEM; + +	async->root = root->fs_info->tree_root; +	async->count = count; +	async->error = 0; +	if (wait) +		async->sync = 1; +	else +		async->sync = 0; +	init_completion(&async->wait); + +	btrfs_init_work(&async->work, delayed_ref_async_start, +			NULL, NULL); + +	btrfs_queue_work(root->fs_info->extent_workers, &async->work); + +	if (wait) { +		wait_for_completion(&async->wait); +		ret = async->error; +		kfree(async); +		return ret; +	} +	return 0; +} +  /*   * this starts processing the delayed reference count updates and   * extent insertions we have queued up so far.  
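
[Editor's note] btrfs_async_run_delayed_refs() above hands the work to a queued worker; when the caller asks to wait, the worker signals a completion and the caller frees the context, otherwise the worker frees its own context. The following is a loose pthread-based user-space sketch of that sync/fire-and-forget ownership pattern only; it does not use the kernel work-queue or transaction APIs, and all names are made up for illustration.

/* User-space model of the context ownership in delayed_ref_async_start()
 * and btrfs_async_run_delayed_refs() above. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct async_ctx {
	int sync;               /* caller waits for the result */
	int error;              /* result reported back when sync */
	int finished;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static void *worker(void *arg)
{
	struct async_ctx *ctx = arg;

	/* ... do the actual work here and record any error ... */
	ctx->error = 0;

	if (ctx->sync) {
		/* caller is waiting: signal and let the caller free ctx */
		pthread_mutex_lock(&ctx->lock);
		ctx->finished = 1;
		pthread_cond_signal(&ctx->done);
		pthread_mutex_unlock(&ctx->lock);
	} else {
		/* nobody waits: the worker owns ctx and frees it */
		free(ctx);
	}
	return NULL;
}

static int run_async(int wait)
{
	struct async_ctx *ctx = calloc(1, sizeof(*ctx));
	pthread_t tid;
	int ret = 0;

	if (!ctx)
		return -1;
	ctx->sync = wait;
	pthread_mutex_init(&ctx->lock, NULL);
	pthread_cond_init(&ctx->done, NULL);

	pthread_create(&tid, NULL, worker, ctx);
	pthread_detach(tid);

	if (wait) {
		pthread_mutex_lock(&ctx->lock);
		while (!ctx->finished)
			pthread_cond_wait(&ctx->done, &ctx->lock);
		pthread_mutex_unlock(&ctx->lock);
		ret = ctx->error;
		free(ctx);
	}
	return ret;
}

int main(void)
{
	printf("sync run returned %d\n", run_async(1));
	run_async(0);        /* fire and forget; the worker frees its context */
	pthread_exit(NULL);  /* keep the process alive for the detached worker */
}
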
count can be @@ -2698,8 +2789,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  	if (root == root->fs_info->extent_root)  		root = root->fs_info->tree_root; -	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); -  	delayed_refs = &trans->transaction->delayed_refs;  	if (count == 0) {  		count = atomic_read(&delayed_refs->num_entries) * 2; @@ -2758,6 +2847,9 @@ again:  		goto again;  	}  out: +	ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); +	if (ret) +		return ret;  	assert_qgroups_uptodate(trans);  	return 0;  } @@ -2964,7 +3056,7 @@ out:  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct extent_buffer *buf, -			   int full_backref, int inc, int for_cow) +			   int full_backref, int inc, int no_quota)  {  	u64 bytenr;  	u64 num_bytes; @@ -2979,11 +3071,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,  			    u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	ref_root = btrfs_header_owner(buf);  	nritems = btrfs_header_nritems(buf);  	level = btrfs_header_level(buf); -	if (!root->ref_cows && level == 0) +	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)  		return 0;  	if (inc) @@ -3014,7 +3110,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			key.offset -= btrfs_file_extent_offset(buf, fi);  			ret = process_func(trans, root, bytenr, num_bytes,  					   parent, ref_root, key.objectid, -					   key.offset, for_cow); +					   key.offset, no_quota);  			if (ret)  				goto fail;  		} else { @@ -3022,7 +3118,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			num_bytes = btrfs_level_size(root, level - 1);  			ret = process_func(trans, root, bytenr, num_bytes,  					   parent, ref_root, level - 1, 0, -					   for_cow); +					   no_quota);  			if (ret)  				goto fail;  		} @@ -3033,15 +3129,15 @@ fail:  }  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref, int for_cow) +		  struct extent_buffer *buf, int full_backref, int no_quota)  { -	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); +	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);  }  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref, int for_cow) +		  struct extent_buffer *buf, int full_backref, int no_quota)  { -	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); +	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);  }  static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3401,10 +3497,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,  		return ret;  	} -	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { +	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)  		INIT_LIST_HEAD(&found->block_groups[i]); -		kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); -	}  	init_rwsem(&found->groups_sem);  	spin_lock_init(&found->lock);  	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -4204,6 +4298,104 @@ static int flush_space(struct btrfs_root *root,  	return ret;  } + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, +				 struct btrfs_space_info *space_info) +{ +	u64 used; +	u64 expected; +	u64 to_reclaim; + +	
to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, +				16 * 1024 * 1024); +	spin_lock(&space_info->lock); +	if (can_overcommit(root, space_info, to_reclaim, +			   BTRFS_RESERVE_FLUSH_ALL)) { +		to_reclaim = 0; +		goto out; +	} + +	used = space_info->bytes_used + space_info->bytes_reserved + +	       space_info->bytes_pinned + space_info->bytes_readonly + +	       space_info->bytes_may_use; +	if (can_overcommit(root, space_info, 1024 * 1024, +			   BTRFS_RESERVE_FLUSH_ALL)) +		expected = div_factor_fine(space_info->total_bytes, 95); +	else +		expected = div_factor_fine(space_info->total_bytes, 90); + +	if (used > expected) +		to_reclaim = used - expected; +	else +		to_reclaim = 0; +	to_reclaim = min(to_reclaim, space_info->bytes_may_use + +				     space_info->bytes_reserved); +out: +	spin_unlock(&space_info->lock); + +	return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, +					struct btrfs_fs_info *fs_info, u64 used) +{ +	return (used >= div_factor_fine(space_info->total_bytes, 98) && +		!btrfs_fs_closing(fs_info) && +		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, +				       struct btrfs_fs_info *fs_info) +{ +	u64 used; + +	spin_lock(&space_info->lock); +	used = space_info->bytes_used + space_info->bytes_reserved + +	       space_info->bytes_pinned + space_info->bytes_readonly + +	       space_info->bytes_may_use; +	if (need_do_async_reclaim(space_info, fs_info, used)) { +		spin_unlock(&space_info->lock); +		return 1; +	} +	spin_unlock(&space_info->lock); + +	return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ +	struct btrfs_fs_info *fs_info; +	struct btrfs_space_info *space_info; +	u64 to_reclaim; +	int flush_state; + +	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); +	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + +	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, +						      space_info); +	if (!to_reclaim) +		return; + +	flush_state = FLUSH_DELAYED_ITEMS_NR; +	do { +		flush_space(fs_info->fs_root, space_info, to_reclaim, +			    to_reclaim, flush_state); +		flush_state++; +		if (!btrfs_need_do_async_reclaim(space_info, fs_info)) +			return; +	} while (flush_state <= COMMIT_TRANS); + +	if (btrfs_need_do_async_reclaim(space_info, fs_info)) +		queue_work(system_unbound_wq, work); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ +	INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} +  /**   * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space   * @root - the root we're allocating for @@ -4311,8 +4503,13 @@ again:  	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {  		flushing = true;  		space_info->flush = 1; +	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { +		used += orig_bytes; +		if (need_do_async_reclaim(space_info, root->fs_info, used) && +		    !work_busy(&root->fs_info->async_reclaim_work)) +			queue_work(system_unbound_wq, +				   &root->fs_info->async_reclaim_work);  	} -  	spin_unlock(&space_info->lock);  	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) @@ -4369,7 +4566,7 @@ static struct btrfs_block_rsv *get_block_rsv(  {  	struct btrfs_block_rsv *block_rsv = NULL; -	if (root->ref_cows) +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		block_rsv = trans->block_rsv;  	if (root == root->fs_info->csum_root && trans->adding_csums) @@ -5621,7 +5818,8 @@ static int 
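
[Editor's note] The new async reclaim path above sizes its target from how far usage exceeds an "expected" fraction of the metadata space (95% of total when a small overcommit would still fit, 90% otherwise) and only wakes the worker once usage crosses 98%. A stand-alone sketch of that arithmetic follows; the percentages mirror the div_factor_fine() calls above, the initial min_t() clamp and the bytes_may_use/bytes_reserved cap are omitted, and all names are illustrative.

/* Stand-alone model of btrfs_calc_reclaim_metadata_size() and
 * need_do_async_reclaim() above. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* div_factor_fine(total, percent) in the kernel is total * percent / 100 */
static uint64_t factor(uint64_t total, unsigned int percent)
{
	return total * percent / 100;
}

static uint64_t calc_reclaim(uint64_t total, uint64_t used, bool can_overcommit_1m)
{
	uint64_t expected = can_overcommit_1m ? factor(total, 95)
					      : factor(total, 90);
	return used > expected ? used - expected : 0;
}

static bool need_async_reclaim(uint64_t total, uint64_t used)
{
	/* only worth waking the worker once usage crosses 98% */
	return used >= factor(total, 98);
}

int main(void)
{
	uint64_t total = 100ULL << 20;	/* 100 MiB of metadata space */
	uint64_t used  =  99ULL << 20;

	printf("reclaim target: %llu bytes, trigger: %d\n",
	       (unsigned long long)calc_reclaim(total, used, false),
	       need_async_reclaim(total, used));
	return 0;
}
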
__btrfs_free_extent(struct btrfs_trans_handle *trans,  				u64 bytenr, u64 num_bytes, u64 parent,  				u64 root_objectid, u64 owner_objectid,  				u64 owner_offset, int refs_to_drop, -				struct btrfs_delayed_extent_op *extent_op) +				struct btrfs_delayed_extent_op *extent_op, +				int no_quota)  {  	struct btrfs_key key;  	struct btrfs_path *path; @@ -5637,9 +5835,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	int num_to_del = 1;  	u32 item_size;  	u64 refs; +	int last_ref = 0; +	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;  	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,  						 SKINNY_METADATA); +	if (!info->quota_enabled || !is_fstree(root_objectid)) +		no_quota = 1; +  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -5687,7 +5890,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			BUG_ON(iref);  			ret = remove_extent_backref(trans, extent_root, path,  						    NULL, refs_to_drop, -						    is_data); +						    is_data, &last_ref);  			if (ret) {  				btrfs_abort_transaction(trans, extent_root, ret);  				goto out; @@ -5806,7 +6009,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	refs = btrfs_extent_refs(leaf, ei);  	if (refs < refs_to_drop) {  		btrfs_err(info, "trying to drop %d refs but we only have %Lu " -			  "for bytenr %Lu\n", refs_to_drop, refs, bytenr); +			  "for bytenr %Lu", refs_to_drop, refs, bytenr);  		ret = -EINVAL;  		btrfs_abort_transaction(trans, extent_root, ret);  		goto out; @@ -5814,6 +6017,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	refs -= refs_to_drop;  	if (refs > 0) { +		type = BTRFS_QGROUP_OPER_SUB_SHARED;  		if (extent_op)  			__run_delayed_extent_op(extent_op, leaf, ei);  		/* @@ -5829,7 +6033,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		if (found_extent) {  			ret = remove_extent_backref(trans, extent_root, path,  						    iref, refs_to_drop, -						    is_data); +						    is_data, &last_ref);  			if (ret) {  				btrfs_abort_transaction(trans, extent_root, ret);  				goto out; @@ -5850,6 +6054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			}  		} +		last_ref = 1;  		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],  				      num_to_del);  		if (ret) { @@ -5872,6 +6077,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			goto out;  		}  	} +	btrfs_release_path(path); + +	/* Deal with the quota accounting */ +	if (!ret && last_ref && !no_quota) { +		int mod_seq = 0; + +		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && +		    type == BTRFS_QGROUP_OPER_SUB_SHARED) +			mod_seq = 1; + +		ret = btrfs_qgroup_record_ref(trans, info, root_objectid, +					      bytenr, num_bytes, type, +					      mod_seq); +	}  out:  	btrfs_free_path(path);  	return ret; @@ -6008,11 +6227,15 @@ out:  /* Can return -ENOMEM */  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, -		      u64 owner, u64 offset, int for_cow) +		      u64 owner, u64 offset, int no_quota)  {  	int ret;  	struct btrfs_fs_info *fs_info = root->fs_info; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);  	/* @@ -6028,13 +6251,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		ret = 
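
[Editor's note] On the free side, the hunk above records a quota operation only when the drop removed the last backref (last_ref), quotas are enabled, and the root is an fs tree; if other references to the extent remain the operation is shared, otherwise exclusive. A compact user-space model of that gate, with illustrative names:

/* Simplified model of the quota-accounting gate at the end of
 * __btrfs_free_extent() above.  All names here are illustrative. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

enum qgroup_oper {
	QGROUP_OPER_SUB_EXCL,
	QGROUP_OPER_SUB_SHARED,
	QGROUP_OPER_NONE,
};

static enum qgroup_oper free_extent_oper(bool quota_enabled, bool is_fstree_root,
					 bool last_ref, uint64_t refs_left)
{
	/* no accounting if quotas are off, the root is not an fs tree,
	 * or this drop did not remove the final backref */
	if (!quota_enabled || !is_fstree_root || !last_ref)
		return QGROUP_OPER_NONE;
	/* other references survive -> the extent stays shared */
	return refs_left ? QGROUP_OPER_SUB_SHARED : QGROUP_OPER_SUB_EXCL;
}

int main(void)
{
	printf("%d\n", free_extent_oper(true, true, true, 0));  /* SUB_EXCL   */
	printf("%d\n", free_extent_oper(true, true, true, 2));  /* SUB_SHARED */
	printf("%d\n", free_extent_oper(false, true, true, 0)); /* NONE       */
	return 0;
}
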
btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,  					num_bytes,  					parent, root_objectid, (int)owner, -					BTRFS_DROP_DELAYED_REF, NULL, for_cow); +					BTRFS_DROP_DELAYED_REF, NULL, no_quota);  	} else {  		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,  						num_bytes,  						parent, root_objectid, owner,  						offset, BTRFS_DROP_DELAYED_REF, -						NULL, for_cow); +						NULL, no_quota);  	}  	return ret;  } @@ -6514,8 +6737,14 @@ loop:  		loop++;  		if (loop == LOOP_ALLOC_CHUNK) {  			struct btrfs_trans_handle *trans; +			int exist = 0; + +			trans = current->journal_info; +			if (trans) +				exist = 1; +			else +				trans = btrfs_join_transaction(root); -			trans = btrfs_join_transaction(root);  			if (IS_ERR(trans)) {  				ret = PTR_ERR(trans);  				goto out; @@ -6532,7 +6761,8 @@ loop:  							root, ret);  			else  				ret = 0; -			btrfs_end_transaction(trans, root); +			if (!exist) +				btrfs_end_transaction(trans, root);  			if (ret)  				goto out;  		} @@ -6733,6 +6963,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(path->nodes[0]);  	btrfs_free_path(path); +	/* Always set parent to 0 here since its exclusive anyway. */ +	ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +				      ins->objectid, ins->offset, +				      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +	if (ret) +		return ret; +  	ret = update_block_group(root, ins->objectid, ins->offset, 1);  	if (ret) { /* -ENOENT, logic error */  		btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6747,7 +6984,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root,  				     u64 parent, u64 root_objectid,  				     u64 flags, struct btrfs_disk_key *key, -				     int level, struct btrfs_key *ins) +				     int level, struct btrfs_key *ins, +				     int no_quota)  {  	int ret;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -6757,6 +6995,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	u32 size = sizeof(*extent_item) + sizeof(*iref); +	u64 num_bytes = ins->offset;  	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,  						 SKINNY_METADATA); @@ -6790,6 +7029,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	if (skinny_metadata) {  		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); +		num_bytes = root->leafsize;  	} else {  		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);  		btrfs_set_tree_block_key(leaf, block_info, key); @@ -6811,6 +7051,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_free_path(path); +	if (!no_quota) { +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      ins->objectid, num_bytes, +					      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +		if (ret) +			return ret; +	} +  	ret = update_block_group(root, ins->objectid, root->leafsize, 1);  	if (ret) { /* -ENOENT, logic error */  		btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6994,6 +7242,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,  						 SKINNY_METADATA); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { +		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, +					    blocksize, level); +		if (!IS_ERR(buf)) +			
root->alloc_bytenr += blocksize; +		return buf; +	} +#endif  	block_rsv = use_block_rsv(trans, root, blocksize);  	if (IS_ERR(block_rsv))  		return ERR_CAST(block_rsv); @@ -7735,7 +7992,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		}  	} -	if (root->in_radix) { +	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {  		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);  	} else {  		free_extent_buffer(root->node); @@ -8327,8 +8584,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  		list_del(&space_info->list);  		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {  			struct kobject *kobj; -			kobj = &space_info->block_group_kobjs[i]; -			if (kobj->parent) { +			kobj = space_info->block_group_kobjs[i]; +			space_info->block_group_kobjs[i] = NULL; +			if (kobj) {  				kobject_del(kobj);  				kobject_put(kobj);  			} @@ -8352,17 +8610,26 @@ static void __link_block_group(struct btrfs_space_info *space_info,  	up_write(&space_info->groups_sem);  	if (first) { -		struct kobject *kobj = &space_info->block_group_kobjs[index]; +		struct raid_kobject *rkobj;  		int ret; -		kobject_get(&space_info->kobj); /* put in release */ -		ret = kobject_add(kobj, &space_info->kobj, "%s", -				  get_raid_name(index)); +		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); +		if (!rkobj) +			goto out_err; +		rkobj->raid_type = index; +		kobject_init(&rkobj->kobj, &btrfs_raid_ktype); +		ret = kobject_add(&rkobj->kobj, &space_info->kobj, +				  "%s", get_raid_name(index));  		if (ret) { -			pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); -			kobject_put(&space_info->kobj); +			kobject_put(&rkobj->kobj); +			goto out_err;  		} +		space_info->block_group_kobjs[index] = &rkobj->kobj;  	} + +	return; +out_err: +	pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n");  }  static struct btrfs_block_group_cache * @@ -8611,7 +8878,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	extent_root = root->fs_info->extent_root; -	root->fs_info->last_trans_log_full_commit = trans->transid; +	btrfs_set_log_full_commit(root->fs_info, trans);  	cache = btrfs_create_block_group_cache(root, chunk_offset, size);  	if (!cache) @@ -8697,6 +8964,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	struct btrfs_root *tree_root = root->fs_info->tree_root;  	struct btrfs_key key;  	struct inode *inode; +	struct kobject *kobj = NULL;  	int ret;  	int index;  	int factor; @@ -8796,11 +9064,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	 */  	list_del_init(&block_group->list);  	if (list_empty(&block_group->space_info->block_groups[index])) { -		kobject_del(&block_group->space_info->block_group_kobjs[index]); -		kobject_put(&block_group->space_info->block_group_kobjs[index]); +		kobj = block_group->space_info->block_group_kobjs[index]; +		block_group->space_info->block_group_kobjs[index] = NULL;  		clear_avail_alloc_bits(root->fs_info, block_group->flags);  	}  	up_write(&block_group->space_info->groups_sem); +	if (kobj) { +		kobject_del(kobj); +		kobject_put(kobj); +	}  	if (block_group->cached == BTRFS_CACHE_STARTED)  		wait_block_group_cache_done(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3955e475ceec..f25a9092b946 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1693,6 +1693,7 @@ again:  		 * shortening the size of the delalloc range we're searching  		 */  		free_extent_state(cached_state); +		cached_state = NULL;  		if (!loops) {  			max_bytes = PAGE_CACHE_SIZE;  			loops = 1; @@ -2367,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)  	if (!uptodate) {  		ClearPageUptodate(page);  		SetPageError(page); +		ret = ret < 0 ? ret : -EIO; +		mapping_set_error(page->mapping, ret);  	}  	return 0;  } @@ -3098,143 +3101,130 @@ static noinline void update_nr_written(struct page *page,  }  /* - * the writepage semantics are similar to regular writepage.  extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback.  Then the lock bits are removed - * and the end_io handler clears the writeback ranges + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent).  In this case the IO has + * been started and the page is already unlocked. 
+ * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked)   */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, -			      void *data) +static noinline_for_stack int writepage_delalloc(struct inode *inode, +			      struct page *page, struct writeback_control *wbc, +			      struct extent_page_data *epd, +			      u64 delalloc_start, +			      unsigned long *nr_written) +{ +	struct extent_io_tree *tree = epd->tree; +	u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; +	u64 nr_delalloc; +	u64 delalloc_to_write = 0; +	u64 delalloc_end = 0; +	int ret; +	int page_started = 0; + +	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) +		return 0; + +	while (delalloc_end < page_end) { +		nr_delalloc = find_lock_delalloc_range(inode, tree, +					       page, +					       &delalloc_start, +					       &delalloc_end, +					       128 * 1024 * 1024); +		if (nr_delalloc == 0) { +			delalloc_start = delalloc_end + 1; +			continue; +		} +		ret = tree->ops->fill_delalloc(inode, page, +					       delalloc_start, +					       delalloc_end, +					       &page_started, +					       nr_written); +		/* File system has been set read-only */ +		if (ret) { +			SetPageError(page); +			/* fill_delalloc should be return < 0 for error +			 * but just in case, we use > 0 here meaning the +			 * IO is started, so we don't want to return > 0 +			 * unless things are going well. +			 */ +			ret = ret < 0 ? ret : -EIO; +			goto done; +		} +		/* +		 * delalloc_end is already one less than the total +		 * length, so we don't subtract one from +		 * PAGE_CACHE_SIZE +		 */ +		delalloc_to_write += (delalloc_end - delalloc_start + +				      PAGE_CACHE_SIZE) >> +				      PAGE_CACHE_SHIFT; +		delalloc_start = delalloc_end + 1; +	} +	if (wbc->nr_to_write < delalloc_to_write) { +		int thresh = 8192; + +		if (delalloc_to_write < thresh * 2) +			thresh = delalloc_to_write; +		wbc->nr_to_write = min_t(u64, delalloc_to_write, +					 thresh); +	} + +	/* did the fill delalloc function already unlock and start +	 * the IO? +	 */ +	if (page_started) { +		/* +		 * we've unlocked the page, so we can't update +		 * the mapping's writeback index, just update +		 * nr_to_write. +		 */ +		wbc->nr_to_write -= *nr_written; +		return 1; +	} + +	ret = 0; + +done: +	return ret; +} + +/* + * helper for __extent_writepage.  This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. 
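
[Editor's note] After walking the delalloc ranges, writepage_delalloc() above bumps wbc->nr_to_write so the caller writes enough pages to cover what it just set up: up to 8192 pages, or the full amount when the range is smaller than twice that threshold. A tiny stand-alone sketch of that clamp (function names are illustrative):

/* Model of the nr_to_write adjustment performed by writepage_delalloc(). */
#include <stdio.h>
#include <stdint.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

static long adjust_nr_to_write(long nr_to_write, uint64_t delalloc_to_write)
{
	if ((uint64_t)nr_to_write < delalloc_to_write) {
		uint64_t thresh = 8192;

		/* small amounts of delalloc are written out in full */
		if (delalloc_to_write < thresh * 2)
			thresh = delalloc_to_write;
		nr_to_write = (long)min_u64(delalloc_to_write, thresh);
	}
	return nr_to_write;
}

int main(void)
{
	printf("%ld\n", adjust_nr_to_write(32, 100));      /* -> 100   */
	printf("%ld\n", adjust_nr_to_write(32, 12000));    /* -> 12000 */
	printf("%ld\n", adjust_nr_to_write(32, 50000));    /* -> 8192  */
	printf("%ld\n", adjust_nr_to_write(64000, 50000)); /* unchanged */
	return 0;
}
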
+ * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, +				 struct page *page, +				 struct writeback_control *wbc, +				 struct extent_page_data *epd, +				 loff_t i_size, +				 unsigned long nr_written, +				 int write_flags, int *nr_ret)  { -	struct inode *inode = page->mapping->host; -	struct extent_page_data *epd = data;  	struct extent_io_tree *tree = epd->tree;  	u64 start = page_offset(page); -	u64 delalloc_start;  	u64 page_end = start + PAGE_CACHE_SIZE - 1;  	u64 end;  	u64 cur = start;  	u64 extent_offset; -	u64 last_byte = i_size_read(inode);  	u64 block_start;  	u64 iosize;  	sector_t sector;  	struct extent_state *cached_state = NULL;  	struct extent_map *em;  	struct block_device *bdev; -	int ret; -	int nr = 0;  	size_t pg_offset = 0;  	size_t blocksize; -	loff_t i_size = i_size_read(inode); -	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; -	u64 nr_delalloc; -	u64 delalloc_end; -	int page_started; -	int compressed; -	int write_flags; -	unsigned long nr_written = 0; -	bool fill_delalloc = true; - -	if (wbc->sync_mode == WB_SYNC_ALL) -		write_flags = WRITE_SYNC; -	else -		write_flags = WRITE; - -	trace___extent_writepage(page, inode, wbc); - -	WARN_ON(!PageLocked(page)); - -	ClearPageError(page); - -	pg_offset = i_size & (PAGE_CACHE_SIZE - 1); -	if (page->index > end_index || -	   (page->index == end_index && !pg_offset)) { -		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); -		unlock_page(page); -		return 0; -	} - -	if (page->index == end_index) { -		char *userpage; - -		userpage = kmap_atomic(page); -		memset(userpage + pg_offset, 0, -		       PAGE_CACHE_SIZE - pg_offset); -		kunmap_atomic(userpage); -		flush_dcache_page(page); -	} -	pg_offset = 0; - -	set_page_extent_mapped(page); - -	if (!tree->ops || !tree->ops->fill_delalloc) -		fill_delalloc = false; - -	delalloc_start = start; -	delalloc_end = 0; -	page_started = 0; -	if (!epd->extent_locked && fill_delalloc) { -		u64 delalloc_to_write = 0; -		/* -		 * make sure the wbc mapping index is at least updated -		 * to this page. -		 */ -		update_nr_written(page, wbc, 0); - -		while (delalloc_end < page_end) { -			nr_delalloc = find_lock_delalloc_range(inode, tree, -						       page, -						       &delalloc_start, -						       &delalloc_end, -						       128 * 1024 * 1024); -			if (nr_delalloc == 0) { -				delalloc_start = delalloc_end + 1; -				continue; -			} -			ret = tree->ops->fill_delalloc(inode, page, -						       delalloc_start, -						       delalloc_end, -						       &page_started, -						       &nr_written); -			/* File system has been set read-only */ -			if (ret) { -				SetPageError(page); -				goto done; -			} -			/* -			 * delalloc_end is already one less than the total -			 * length, so we don't subtract one from -			 * PAGE_CACHE_SIZE -			 */ -			delalloc_to_write += (delalloc_end - delalloc_start + -					      PAGE_CACHE_SIZE) >> -					      PAGE_CACHE_SHIFT; -			delalloc_start = delalloc_end + 1; -		} -		if (wbc->nr_to_write < delalloc_to_write) { -			int thresh = 8192; - -			if (delalloc_to_write < thresh * 2) -				thresh = delalloc_to_write; -			wbc->nr_to_write = min_t(u64, delalloc_to_write, -						 thresh); -		} +	int ret = 0; +	int nr = 0; +	bool compressed; -		/* did the fill delalloc function already unlock and start -		 * the IO? 
-		 */ -		if (page_started) { -			ret = 0; -			/* -			 * we've unlocked the page, so we can't update -			 * the mapping's writeback index, just update -			 * nr_to_write. -			 */ -			wbc->nr_to_write -= nr_written; -			goto done_unlocked; -		} -	}  	if (tree->ops && tree->ops->writepage_start_hook) {  		ret = tree->ops->writepage_start_hook(page, start,  						      page_end); @@ -3244,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  				wbc->pages_skipped++;  			else  				redirty_page_for_writepage(wbc, page); +  			update_nr_written(page, wbc, nr_written);  			unlock_page(page); -			ret = 0; +			ret = 1;  			goto done_unlocked;  		}  	} @@ -3258,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	update_nr_written(page, wbc, nr_written + 1);  	end = page_end; -	if (last_byte <= start) { +	if (i_size <= start) {  		if (tree->ops && tree->ops->writepage_end_io_hook)  			tree->ops->writepage_end_io_hook(page, start,  							 page_end, NULL, 1); @@ -3268,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	blocksize = inode->i_sb->s_blocksize;  	while (cur <= end) { -		if (cur >= last_byte) { +		u64 em_end; +		if (cur >= i_size) {  			if (tree->ops && tree->ops->writepage_end_io_hook)  				tree->ops->writepage_end_io_hook(page, cur,  							 page_end, NULL, 1); @@ -3278,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  				     end - cur + 1, 1);  		if (IS_ERR_OR_NULL(em)) {  			SetPageError(page); +			ret = PTR_ERR_OR_ZERO(em);  			break;  		}  		extent_offset = cur - em->start; -		BUG_ON(extent_map_end(em) <= cur); +		em_end = extent_map_end(em); +		BUG_ON(em_end <= cur);  		BUG_ON(end < cur); -		iosize = min(extent_map_end(em) - cur, end - cur + 1); +		iosize = min(em_end - cur, end - cur + 1);  		iosize = ALIGN(iosize, blocksize);  		sector = (em->block_start + extent_offset) >> 9;  		bdev = em->bdev; @@ -3320,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  			pg_offset += iosize;  			continue;  		} -		/* leave this out until we have a page_mkwrite call */ -		if (0 && !test_range_bit(tree, cur, cur + iosize - 1, -				   EXTENT_DIRTY, 0, NULL)) { -			cur = cur + iosize; -			pg_offset += iosize; -			continue; -		}  		if (tree->ops && tree->ops->writepage_io_hook) {  			ret = tree->ops->writepage_io_hook(page, cur, @@ -3337,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  		if (ret) {  			SetPageError(page);  		} else { -			unsigned long max_nr = end_index + 1; +			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;  			set_range_writeback(tree, cur, cur + iosize - 1);  			if (!PageWriteback(page)) { @@ -3359,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  		nr++;  	}  done: +	*nr_ret = nr; + +done_unlocked: + +	/* drop our reference on any cached states */ +	free_extent_state(cached_state); +	return ret; +} + +/* + * the writepage semantics are similar to regular writepage.  extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback.  
Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, +			      void *data) +{ +	struct inode *inode = page->mapping->host; +	struct extent_page_data *epd = data; +	u64 start = page_offset(page); +	u64 page_end = start + PAGE_CACHE_SIZE - 1; +	int ret; +	int nr = 0; +	size_t pg_offset = 0; +	loff_t i_size = i_size_read(inode); +	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; +	int write_flags; +	unsigned long nr_written = 0; + +	if (wbc->sync_mode == WB_SYNC_ALL) +		write_flags = WRITE_SYNC; +	else +		write_flags = WRITE; + +	trace___extent_writepage(page, inode, wbc); + +	WARN_ON(!PageLocked(page)); + +	ClearPageError(page); + +	pg_offset = i_size & (PAGE_CACHE_SIZE - 1); +	if (page->index > end_index || +	   (page->index == end_index && !pg_offset)) { +		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); +		unlock_page(page); +		return 0; +	} + +	if (page->index == end_index) { +		char *userpage; + +		userpage = kmap_atomic(page); +		memset(userpage + pg_offset, 0, +		       PAGE_CACHE_SIZE - pg_offset); +		kunmap_atomic(userpage); +		flush_dcache_page(page); +	} + +	pg_offset = 0; + +	set_page_extent_mapped(page); + +	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); +	if (ret == 1) +		goto done_unlocked; +	if (ret) +		goto done; + +	ret = __extent_writepage_io(inode, page, wbc, epd, +				    i_size, nr_written, write_flags, &nr); +	if (ret == 1) +		goto done_unlocked; + +done:  	if (nr == 0) {  		/* make sure the mapping tag for page dirty gets cleared */  		set_page_writeback(page);  		end_page_writeback(page);  	} +	if (PageError(page)) { +		ret = ret < 0 ? ret : -EIO; +		end_extent_writepage(page, ret, start, page_end); +	}  	unlock_page(page); +	return ret;  done_unlocked: - -	/* drop our reference on any cached states */ -	free_extent_state(cached_state);  	return 0;  } @@ -3385,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)  		    TASK_UNINTERRUPTIBLE);  } -static int lock_extent_buffer_for_io(struct extent_buffer *eb, -				     struct btrfs_fs_info *fs_info, -				     struct extent_page_data *epd) +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, +			  struct btrfs_fs_info *fs_info, +			  struct extent_page_data *epd)  {  	unsigned long i, num_pages;  	int flush = 0; @@ -3458,7 +3523,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,  static void end_extent_buffer_writeback(struct extent_buffer *eb)  {  	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);  } @@ -3492,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)  	bio_put(bio);  } -static int write_one_eb(struct extent_buffer *eb, +static noinline_for_stack int write_one_eb(struct extent_buffer *eb,  			struct btrfs_fs_info *fs_info,  			struct writeback_control *wbc,  			struct extent_page_data *epd) @@ -3690,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,  	struct inode *inode = mapping->host;  	int ret = 0;  	int done = 0; +	int err = 0;  	int nr_to_write_done = 0;  	struct pagevec pvec;  	int nr_pages; @@ -3776,8 +3842,8 @@ retry:  				unlock_page(page);  				ret = 0;  			} -			if (ret) -				done = 1; +			if (!err && ret < 0) +				err = ret;  			/*  			 * the filesystem may choose to bump up nr_to_write. 
@@ -3789,7 +3855,7 @@ retry:  		pagevec_release(&pvec);  		cond_resched();  	} -	if (!scanned && !done) { +	if (!scanned && !done && !err) {  		/*  		 * We hit the last page and there is more work to be done: wrap  		 * back to the start of the file @@ -3799,7 +3865,7 @@ retry:  		goto retry;  	}  	btrfs_add_delayed_iput(inode); -	return ret; +	return err;  }  static void flush_epd_write_bio(struct extent_page_data *epd) @@ -4510,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)  	spin_unlock(&eb->refs_lock);  } -static void mark_extent_buffer_accessed(struct extent_buffer *eb) +static void mark_extent_buffer_accessed(struct extent_buffer *eb, +		struct page *accessed)  {  	unsigned long num_pages, i; @@ -4519,7 +4586,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)  	num_pages = num_extent_pages(eb->start, eb->len);  	for (i = 0; i < num_pages; i++) {  		struct page *p = extent_buffer_page(eb, i); -		mark_page_accessed(p); +		if (p != accessed) +			mark_page_accessed(p);  	}  } @@ -4533,7 +4601,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,  			       start >> PAGE_CACHE_SHIFT);  	if (eb && atomic_inc_not_zero(&eb->refs)) {  		rcu_read_unlock(); -		mark_extent_buffer_accessed(eb); +		mark_extent_buffer_accessed(eb, NULL);  		return eb;  	}  	rcu_read_unlock(); @@ -4541,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,  	return NULL;  } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, +					       u64 start, unsigned long len) +{ +	struct extent_buffer *eb, *exists = NULL; +	int ret; + +	eb = find_extent_buffer(fs_info, start); +	if (eb) +		return eb; +	eb = alloc_dummy_extent_buffer(start, len); +	if (!eb) +		return NULL; +	eb->fs_info = fs_info; +again: +	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); +	if (ret) +		goto free_eb; +	spin_lock(&fs_info->buffer_lock); +	ret = radix_tree_insert(&fs_info->buffer_radix, +				start >> PAGE_CACHE_SHIFT, eb); +	spin_unlock(&fs_info->buffer_lock); +	radix_tree_preload_end(); +	if (ret == -EEXIST) { +		exists = find_extent_buffer(fs_info, start); +		if (exists) +			goto free_eb; +		else +			goto again; +	} +	check_buffer_tree_ref(eb); +	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + +	/* +	 * We will free dummy extent buffer's if they come into +	 * free_extent_buffer with a ref count of 2, but if we are using this we +	 * want the buffers to stay in memory until we're done with them, so +	 * bump the ref count again. 
+	 */ +	atomic_inc(&eb->refs); +	return eb; +free_eb: +	btrfs_release_extent_buffer(eb); +	return exists; +} +#endif +  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,  					  u64 start, unsigned long len)  { @@ -4581,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,  				spin_unlock(&mapping->private_lock);  				unlock_page(p);  				page_cache_release(p); -				mark_extent_buffer_accessed(exists); +				mark_extent_buffer_accessed(exists, p);  				goto free_eb;  			} @@ -4596,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,  		attach_extent_buffer_page(eb, p);  		spin_unlock(&mapping->private_lock);  		WARN_ON(PageDirty(p)); -		mark_page_accessed(p);  		eb->pages[i] = p;  		if (!PageUptodate(p))  			uptodate = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c488b45237bf..8b63f2d46518 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -350,5 +350,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,  				      struct extent_io_tree *tree,  				      struct page *locked_page, u64 *start,  				      u64 *end, u64 max_bytes); +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, +					       u64 start, unsigned long len);  #endif  #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 127555b29f58..f46cfe45d686 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,  found:  		csum += count * csum_size;  		nblocks -= count; +		bio_index += count;  		while (count--) {  			disk_bytenr += bvec->bv_len;  			offset += bvec->bv_len; -			bio_index++;  			bvec++;  		}  	} @@ -750,7 +750,7 @@ again:  		int slot = path->slots[0] + 1;  		/* we didn't find a csum item, insert one */  		nritems = btrfs_header_nritems(path->nodes[0]); -		if (path->slots[0] >= nritems - 1) { +		if (!nritems || (path->slots[0] >= nritems - 1)) {  			ret = btrfs_next_leaf(root, path);  			if (ret == 1)  				found_next = 1; @@ -885,3 +885,79 @@ out:  fail_unlock:  	goto out;  } + +void btrfs_extent_item_to_extent_map(struct inode *inode, +				     const struct btrfs_path *path, +				     struct btrfs_file_extent_item *fi, +				     const bool new_inline, +				     struct extent_map *em) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct extent_buffer *leaf = path->nodes[0]; +	const int slot = path->slots[0]; +	struct btrfs_key key; +	u64 extent_start, extent_end; +	u64 bytenr; +	u8 type = btrfs_file_extent_type(leaf, fi); +	int compress_type = btrfs_file_extent_compression(leaf, fi); + +	em->bdev = root->fs_info->fs_devices->latest_bdev; +	btrfs_item_key_to_cpu(leaf, &key, slot); +	extent_start = key.offset; + +	if (type == BTRFS_FILE_EXTENT_REG || +	    type == BTRFS_FILE_EXTENT_PREALLOC) { +		extent_end = extent_start + +			btrfs_file_extent_num_bytes(leaf, fi); +	} else if (type == BTRFS_FILE_EXTENT_INLINE) { +		size_t size; +		size = btrfs_file_extent_inline_len(leaf, slot, fi); +		extent_end = ALIGN(extent_start + size, root->sectorsize); +	} + +	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); +	if (type == BTRFS_FILE_EXTENT_REG || +	    type == BTRFS_FILE_EXTENT_PREALLOC) { +		em->start = extent_start; +		em->len = extent_end - extent_start; +		em->orig_start = extent_start - +			btrfs_file_extent_offset(leaf, fi); +		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); +		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); +		if 
(bytenr == 0) { +			em->block_start = EXTENT_MAP_HOLE; +			return; +		} +		if (compress_type != BTRFS_COMPRESS_NONE) { +			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +			em->compress_type = compress_type; +			em->block_start = bytenr; +			em->block_len = em->orig_block_len; +		} else { +			bytenr += btrfs_file_extent_offset(leaf, fi); +			em->block_start = bytenr; +			em->block_len = em->len; +			if (type == BTRFS_FILE_EXTENT_PREALLOC) +				set_bit(EXTENT_FLAG_PREALLOC, &em->flags); +		} +	} else if (type == BTRFS_FILE_EXTENT_INLINE) { +		em->block_start = EXTENT_MAP_INLINE; +		em->start = extent_start; +		em->len = extent_end - extent_start; +		/* +		 * Initialize orig_start and block_len with the same values +		 * as in inode.c:btrfs_get_extent(). +		 */ +		em->orig_start = EXTENT_MAP_HOLE; +		em->block_len = (u64)-1; +		if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { +			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +			em->compress_type = compress_type; +		} +	} else { +		btrfs_err(root->fs_info, +			  "unknown file extent item type %d, inode %llu, offset %llu, root %llu", +			  type, btrfs_ino(inode), extent_start, +			  root->root_key.objectid); +	} +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ae6af072b635..e472441feb5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -40,6 +40,7 @@  #include "tree-log.h"  #include "locking.h"  #include "volumes.h" +#include "qgroup.h"  static struct kmem_cache *btrfs_inode_defrag_cachep;  /* @@ -470,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)  	for (i = 0; i < num_pages; i++) {  		/* page checked is some magic around finding pages that  		 * have been modified without going through btrfs_set_page_dirty -		 * clear it here +		 * clear it here. There should be no need to mark the pages +		 * accessed as prepare_pages should have marked them accessed +		 * in prepare_pages via find_or_create_page()  		 */  		ClearPageChecked(pages[i]);  		unlock_page(pages[i]); -		mark_page_accessed(pages[i]);  		page_cache_release(pages[i]);  	}  } @@ -714,7 +716,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,  	int recow;  	int ret;  	int modify_tree = -1; -	int update_refs = (root->ref_cows || root == root->fs_info->tree_root); +	int update_refs;  	int found = 0;  	int leafs_visited = 0; @@ -724,6 +726,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,  	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)  		modify_tree = 0; +	update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || +		       root == root->fs_info->tree_root);  	while (1) {  		recow = 0;  		ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -780,6 +784,18 @@ next_slot:  			extent_end = search_start;  		} +		/* +		 * Don't skip extent items representing 0 byte lengths. They +		 * used to be created (bug) if while punching holes we hit +		 * -ENOSPC condition. So if we find one here, just ensure we +		 * delete it, otherwise we would insert a new file extent item +		 * with the same key (offset) as that 0 bytes length file +		 * extent item in the call to setup_items_for_insert() later +		 * in this function. 
+		 */ +		if (extent_end == key.offset && extent_end >= search_start) +			goto delete_extent_item; +  		if (extent_end <= search_start) {  			path->slots[0]++;  			goto next_slot; @@ -835,7 +851,7 @@ next_slot:  						disk_bytenr, num_bytes, 0,  						root->root_key.objectid,  						new_key.objectid, -						start - extent_offset, 0); +						start - extent_offset, 1);  				BUG_ON(ret); /* -ENOMEM */  			}  			key.offset = start; @@ -893,6 +909,7 @@ next_slot:  		 *    | ------ extent ------ |  		 */  		if (start <= key.offset && end >= extent_end) { +delete_extent_item:  			if (del_nr == 0) {  				del_slot = path->slots[0];  				del_nr = 1; @@ -1191,7 +1208,7 @@ again:  		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,  					   root->root_key.objectid, -					   ino, orig_offset, 0); +					   ino, orig_offset, 1);  		BUG_ON(ret); /* -ENOMEM */  		if (split == start) { @@ -2009,8 +2026,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  		if (!full_sync) {  			ret = btrfs_wait_ordered_range(inode, start,  						       end - start + 1); -			if (ret) +			if (ret) { +				btrfs_end_transaction(trans, root);  				goto out; +			}  		}  		ret = btrfs_commit_transaction(trans, root);  	} else { @@ -2168,6 +2187,37 @@ out:  	return 0;  } +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + *	   em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ +	struct extent_map *em; +	int ret = 0; + +	em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); +	if (IS_ERR_OR_NULL(em)) { +		if (!em) +			ret = -ENOMEM; +		else +			ret = PTR_ERR(em); +		return ret; +	} + +	/* Hole or vacuum extent(only exists in no-hole mode) */ +	if (em->block_start == EXTENT_MAP_HOLE) { +		ret = 1; +		*len = em->start + em->len > *start + *len ? 
+		       0 : *start + *len - em->start - em->len; +		*start = em->start + em->len; +	} +	free_extent_map(em); +	return ret; +} +  static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2175,25 +2225,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  	struct btrfs_path *path;  	struct btrfs_block_rsv *rsv;  	struct btrfs_trans_handle *trans; -	u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); -	u64 lockend = round_down(offset + len, -				 BTRFS_I(inode)->root->sectorsize) - 1; -	u64 cur_offset = lockstart; +	u64 lockstart; +	u64 lockend; +	u64 tail_start; +	u64 tail_len; +	u64 orig_start = offset; +	u64 cur_offset;  	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);  	u64 drop_end;  	int ret = 0;  	int err = 0;  	int rsv_count; -	bool same_page = ((offset >> PAGE_CACHE_SHIFT) == -			  ((offset + len - 1) >> PAGE_CACHE_SHIFT)); +	bool same_page;  	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); -	u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); +	u64 ino_size;  	ret = btrfs_wait_ordered_range(inode, offset, len);  	if (ret)  		return ret;  	mutex_lock(&inode->i_mutex); +	ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); +	ret = find_first_non_hole(inode, &offset, &len); +	if (ret < 0) +		goto out_only_mutex; +	if (ret && !len) { +		/* Already in a large hole */ +		ret = 0; +		goto out_only_mutex; +	} + +	lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); +	lockend = round_down(offset + len, +			     BTRFS_I(inode)->root->sectorsize) - 1; +	same_page = ((offset >> PAGE_CACHE_SHIFT) == +		    ((offset + len - 1) >> PAGE_CACHE_SHIFT)); +  	/*  	 * We needn't truncate any page which is beyond the end of the file  	 * because we are sure there is no data there. 
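
[Editor's note] find_first_non_hole() above moves start to the end of a hole extent and shrinks len by whatever the hole swallowed (len becomes 0 when the hole covers the whole range); the punch-hole path repeats this to skip ranges that are already holes. The following user-space sketch models only the start/len arithmetic, assuming the hole extent is described by (em_start, em_len); names are illustrative.

/* Stand-alone model of the start/len adjustment done by
 * find_first_non_hole() when the looked-up extent is a hole. */
#include <stdio.h>
#include <stdint.h>

/* Returns 1 if (em_start, em_len) describes a hole covering *start and
 * updates *start/*len to the remaining, not-yet-hole part of the range. */
static int skip_hole(uint64_t em_start, uint64_t em_len,
		     uint64_t *start, uint64_t *len)
{
	uint64_t hole_end = em_start + em_len;
	uint64_t range_end = *start + *len;

	if (em_start > *start || hole_end <= *start)
		return 0;		/* extent does not cover *start */

	/* hole covers the whole remaining range -> nothing left to punch */
	*len = hole_end > range_end ? 0 : range_end - hole_end;
	*start = hole_end;
	return 1;
}

int main(void)
{
	uint64_t start = 4096, len = 8192;

	/* a hole from 0..8192 swallows the first 4 KiB of the range,
	 * leaving 4 KiB starting at 8192 */
	if (skip_hole(0, 8192, &start, &len))
		printf("start=%llu len=%llu\n",
		       (unsigned long long)start, (unsigned long long)len);
	return 0;
}
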
@@ -2205,8 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  	if (same_page && len < PAGE_CACHE_SIZE) {  		if (offset < ino_size)  			ret = btrfs_truncate_page(inode, offset, len, 0); -		mutex_unlock(&inode->i_mutex); -		return ret; +		goto out_only_mutex;  	}  	/* zero back part of the first page */ @@ -2218,12 +2284,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  		}  	} -	/* zero the front end of the last page */ -	if (offset + len < ino_size) { -		ret = btrfs_truncate_page(inode, offset + len, 0, 1); -		if (ret) { -			mutex_unlock(&inode->i_mutex); -			return ret; +	/* Check the aligned pages after the first unaligned page, +	 * if offset != orig_start, which means the first unaligned page +	 * including serveral following pages are already in holes, +	 * the extra check can be skipped */ +	if (offset == orig_start) { +		/* after truncate page, check hole again */ +		len = offset + len - lockstart; +		offset = lockstart; +		ret = find_first_non_hole(inode, &offset, &len); +		if (ret < 0) +			goto out_only_mutex; +		if (ret && !len) { +			ret = 0; +			goto out_only_mutex; +		} +		lockstart = offset; +	} + +	/* Check the tail unaligned part is in a hole */ +	tail_start = lockend + 1; +	tail_len = offset + len - tail_start; +	if (tail_len) { +		ret = find_first_non_hole(inode, &tail_start, &tail_len); +		if (unlikely(ret < 0)) +			goto out_only_mutex; +		if (!ret) { +			/* zero the front end of the last page */ +			if (tail_start + tail_len < ino_size) { +				ret = btrfs_truncate_page(inode, +						tail_start + tail_len, 0, 1); +				if (ret) +					goto out_only_mutex; +				}  		}  	} @@ -2249,9 +2342,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  		if ((!ordered ||  		    (ordered->file_offset + ordered->len <= lockstart ||  		     ordered->file_offset > lockend)) && -		     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, -				     lockend, EXTENT_UPTODATE, 0, -				     cached_state)) { +		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {  			if (ordered)  				btrfs_put_ordered_extent(ordered);  			break; @@ -2299,6 +2390,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  	BUG_ON(ret);  	trans->block_rsv = rsv; +	cur_offset = lockstart; +	len = lockend - cur_offset;  	while (cur_offset < lockend) {  		ret = __btrfs_drop_extents(trans, root, inode, path,  					   cur_offset, lockend + 1, @@ -2339,6 +2432,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  					      rsv, min_size);  		BUG_ON(ret);	/* shouldn't happen */  		trans->block_rsv = rsv; + +		ret = find_first_non_hole(inode, &cur_offset, &len); +		if (unlikely(ret < 0)) +			break; +		if (ret && !len) { +			ret = 0; +			break; +		}  	}  	if (ret) { @@ -2347,7 +2448,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  	}  	trans->block_rsv = &root->fs_info->trans_block_rsv; -	if (cur_offset < ino_size) { +	/* +	 * Don't insert file hole extent item if it's for a range beyond eof +	 * (because it's useless) or if it represents a 0 bytes range (when +	 * cur_offset == drop_end). 
+	 */ +	if (cur_offset < ino_size && cur_offset < drop_end) {  		ret = fill_holes(trans, inode, path, cur_offset, drop_end);  		if (ret) {  			err = ret; @@ -2372,6 +2478,7 @@ out_free:  out:  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,  			     &cached_state, GFP_NOFS); +out_only_mutex:  	mutex_unlock(&inode->i_mutex);  	if (ret && !err)  		err = ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 73f3de7a083c..372b05ff1943 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -831,7 +831,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,  	if (!matched) {  		__btrfs_remove_free_space_cache(ctl); -		btrfs_err(fs_info, "block group %llu has wrong amount of free space", +		btrfs_warn(fs_info, "block group %llu has wrong amount of free space",  			block_group->key.objectid);  		ret = -1;  	} @@ -843,7 +843,7 @@ out:  		spin_unlock(&block_group->lock);  		ret = 0; -		btrfs_err(fs_info, "failed to load free space cache for block group %llu", +		btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",  			block_group->key.objectid);  	} @@ -851,90 +851,44 @@ out:  	return ret;  } -/** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle - * @path - the path to use - * @offset - the offset for the key we'll insert - * - * This function writes out a free space cache struct to disk for quick recovery - * on mount.  This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. - */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, -				   struct btrfs_free_space_ctl *ctl, -				   struct btrfs_block_group_cache *block_group, -				   struct btrfs_trans_handle *trans, -				   struct btrfs_path *path, u64 offset) +static noinline_for_stack +int write_cache_extent_entries(struct io_ctl *io_ctl, +			      struct btrfs_free_space_ctl *ctl, +			      struct btrfs_block_group_cache *block_group, +			      int *entries, int *bitmaps, +			      struct list_head *bitmap_list)  { -	struct btrfs_free_space_header *header; -	struct extent_buffer *leaf; -	struct rb_node *node; -	struct list_head *pos, *n; -	struct extent_state *cached_state = NULL; -	struct btrfs_free_cluster *cluster = NULL; -	struct extent_io_tree *unpin = NULL; -	struct io_ctl io_ctl; -	struct list_head bitmap_list; -	struct btrfs_key key; -	u64 start, extent_start, extent_end, len; -	int entries = 0; -	int bitmaps = 0;  	int ret; -	int err = -1; - -	INIT_LIST_HEAD(&bitmap_list); - -	if (!i_size_read(inode)) -		return -1; - -	ret = io_ctl_init(&io_ctl, inode, root); -	if (ret) -		return -1; +	struct btrfs_free_cluster *cluster = NULL; +	struct rb_node *node = rb_first(&ctl->free_space_offset);  	/* Get the cluster for this block_group if it exists */ -	if (block_group && !list_empty(&block_group->cluster_list)) +	if (block_group && !list_empty(&block_group->cluster_list)) {  		cluster = list_entry(block_group->cluster_list.next,  				     struct btrfs_free_cluster,  				     block_group_list); +	} -	/* Lock all pages first so we can lock the extent safely. 
*/ -	io_ctl_prepare_pages(&io_ctl, inode, 0); - -	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, -			 0, &cached_state); - -	node = rb_first(&ctl->free_space_offset);  	if (!node && cluster) {  		node = rb_first(&cluster->root);  		cluster = NULL;  	} -	/* Make sure we can fit our crcs into the first page */ -	if (io_ctl.check_crcs && -	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) -		goto out_nospc; - -	io_ctl_set_generation(&io_ctl, trans->transid); -  	/* Write out the extent entries */  	while (node) {  		struct btrfs_free_space *e;  		e = rb_entry(node, struct btrfs_free_space, offset_index); -		entries++; +		*entries += 1; -		ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, +		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,  				       e->bitmap);  		if (ret) -			goto out_nospc; +			goto fail;  		if (e->bitmap) { -			list_add_tail(&e->list, &bitmap_list); -			bitmaps++; +			list_add_tail(&e->list, bitmap_list); +			*bitmaps += 1;  		}  		node = rb_next(node);  		if (!node && cluster) { @@ -942,13 +896,84 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  			cluster = NULL;  		}  	} +	return 0; +fail: +	return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, +		  struct btrfs_root *root, +		  struct inode *inode, +		  struct btrfs_path *path, u64 offset, +		  int entries, int bitmaps) +{ +	struct btrfs_key key; +	struct btrfs_free_space_header *header; +	struct extent_buffer *leaf; +	int ret; + +	key.objectid = BTRFS_FREE_SPACE_OBJECTID; +	key.offset = offset; +	key.type = 0; + +	ret = btrfs_search_slot(trans, root, &key, path, 0, 1); +	if (ret < 0) { +		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, +				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, +				 GFP_NOFS); +		goto fail; +	} +	leaf = path->nodes[0]; +	if (ret > 0) { +		struct btrfs_key found_key; +		ASSERT(path->slots[0]); +		path->slots[0]--; +		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); +		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || +		    found_key.offset != offset) { +			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, +					 inode->i_size - 1, +					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, +					 NULL, GFP_NOFS); +			btrfs_release_path(path); +			goto fail; +		} +	} + +	BTRFS_I(inode)->generation = trans->transid; +	header = btrfs_item_ptr(leaf, path->slots[0], +				struct btrfs_free_space_header); +	btrfs_set_free_space_entries(leaf, header, entries); +	btrfs_set_free_space_bitmaps(leaf, header, bitmaps); +	btrfs_set_free_space_generation(leaf, header, trans->transid); +	btrfs_mark_buffer_dirty(leaf); +	btrfs_release_path(path); + +	return 0; + +fail: +	return -1; +} + +static noinline_for_stack int +add_ioctl_entries(struct btrfs_root *root, +		  struct inode *inode, +		  struct btrfs_block_group_cache *block_group, +		  struct io_ctl *io_ctl, +		  struct extent_state **cached_state, +		  struct list_head *bitmap_list, +		  int *entries) +{ +	u64 start, extent_start, extent_end, len; +	struct list_head *pos, *n; +	struct extent_io_tree *unpin = NULL; +	int ret;  	/*  	 * We want to add any pinned extents to our free space cache  	 * so we don't leak the space -	 */ - -	/* +	 *  	 * We shouldn't have switched the pinned extents yet so this is the  	 * right one  	 */ @@ -977,8 +1002,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  				 block_group->key.offset, extent_end + 1);  		len = extent_end - extent_start; -		
entries++; -		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); +		*entries += 1; +		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);  		if (ret)  			goto out_nospc; @@ -986,74 +1011,129 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	}  	/* Write out the bitmaps */ -	list_for_each_safe(pos, n, &bitmap_list) { +	list_for_each_safe(pos, n, bitmap_list) {  		struct btrfs_free_space *entry =  			list_entry(pos, struct btrfs_free_space, list); -		ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); +		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);  		if (ret)  			goto out_nospc;  		list_del_init(&entry->list);  	}  	/* Zero out the rest of the pages just to make sure */ -	io_ctl_zero_remaining_pages(&io_ctl); +	io_ctl_zero_remaining_pages(io_ctl); -	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, -				0, i_size_read(inode), &cached_state); -	io_ctl_drop_pages(&io_ctl); +	ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, +				0, i_size_read(inode), cached_state); +	io_ctl_drop_pages(io_ctl);  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, -			     i_size_read(inode) - 1, &cached_state, GFP_NOFS); +			     i_size_read(inode) - 1, cached_state, GFP_NOFS);  	if (ret) -		goto out; +		goto fail;  	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);  	if (ret) {  		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,  				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,  				 GFP_NOFS); -		goto out; +		goto fail;  	} +	return 0; -	key.objectid = BTRFS_FREE_SPACE_OBJECTID; -	key.offset = offset; -	key.type = 0; +fail: +	return -1; -	ret = btrfs_search_slot(trans, root, &key, path, 0, 1); -	if (ret < 0) { -		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, -				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, -				 GFP_NOFS); -		goto out; -	} -	leaf = path->nodes[0]; -	if (ret > 0) { -		struct btrfs_key found_key; -		ASSERT(path->slots[0]); -		path->slots[0]--; -		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || -		    found_key.offset != offset) { -			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, -					 inode->i_size - 1, -					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, -					 NULL, GFP_NOFS); -			btrfs_release_path(path); -			goto out; -		} +out_nospc: +	return -ENOSPC; +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, +			   struct io_ctl *io_ctl, +			   struct extent_state **cached_state, +			   struct list_head *bitmap_list) +{ +	struct list_head *pos, *n; +	list_for_each_safe(pos, n, bitmap_list) { +		struct btrfs_free_space *entry = +			list_entry(pos, struct btrfs_free_space, list); +		list_del_init(&entry->list);  	} +	io_ctl_drop_pages(io_ctl); +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, +			     i_size_read(inode) - 1, cached_state, +			     GFP_NOFS); +} -	BTRFS_I(inode)->generation = trans->transid; -	header = btrfs_item_ptr(leaf, path->slots[0], -				struct btrfs_free_space_header); -	btrfs_set_free_space_entries(leaf, header, entries); -	btrfs_set_free_space_bitmaps(leaf, header, bitmaps); -	btrfs_set_free_space_generation(leaf, header, trans->transid); -	btrfs_mark_buffer_dirty(leaf); -	btrfs_release_path(path); +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans 
- the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount.  This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, +				   struct btrfs_free_space_ctl *ctl, +				   struct btrfs_block_group_cache *block_group, +				   struct btrfs_trans_handle *trans, +				   struct btrfs_path *path, u64 offset) +{ +	struct extent_state *cached_state = NULL; +	struct io_ctl io_ctl; +	struct list_head bitmap_list; +	int entries = 0; +	int bitmaps = 0; +	int ret; +	int err = -1; + +	INIT_LIST_HEAD(&bitmap_list); + +	if (!i_size_read(inode)) +		return -1; + +	ret = io_ctl_init(&io_ctl, inode, root); +	if (ret) +		return -1; + +	/* Lock all pages first so we can lock the extent safely. */ +	io_ctl_prepare_pages(&io_ctl, inode, 0); + +	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, +			 0, &cached_state); + + +	/* Make sure we can fit our crcs into the first page */ +	if (io_ctl.check_crcs && +	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) +		goto out_nospc; + +	io_ctl_set_generation(&io_ctl, trans->transid); + +	ret = write_cache_extent_entries(&io_ctl, ctl, +					 block_group, &entries, &bitmaps, +					 &bitmap_list); +	if (ret) +		goto out_nospc; + +	ret = add_ioctl_entries(root, inode, block_group, &io_ctl, +				&cached_state, &bitmap_list, &entries); + +	if (ret == -ENOSPC) +		goto out_nospc; +	else if (ret) +		goto out; + +	err = update_cache_item(trans, root, inode, path, offset, +				entries, bitmaps); -	err = 0;  out:  	io_ctl_free(&io_ctl);  	if (err) { @@ -1064,14 +1144,8 @@ out:  	return err;  out_nospc: -	list_for_each_safe(pos, n, &bitmap_list) { -		struct btrfs_free_space *entry = -			list_entry(pos, struct btrfs_free_space, list); -		list_del_init(&entry->list); -	} -	io_ctl_drop_pages(&io_ctl); -	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, -			     i_size_read(inode) - 1, &cached_state, GFP_NOFS); + +	cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);  	goto out;  } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 86935f5ae291..888fbe19079f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -174,7 +174,7 @@ static void start_caching(struct btrfs_root *root)  				       BTRFS_LAST_FREE_OBJECTID - objectid + 1);  	} -	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", +	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",  			  root->root_key.objectid);  	if (IS_ERR(tsk)) {  		btrfs_warn(root->fs_info, "failed to start inode caching task"); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f805bc944fa..7fa5f7fd7bc7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,   * the btree.  
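The free-space-cache rework above splits __btrfs_write_out_cache() into several noinline_for_stack helpers so that each stage's locals occupy the stack only while that stage runs. A minimal sketch of the annotation, not taken from the patch (the helper name and scratch buffer are invented); noinline_for_stack is just noinline from <linux/compiler.h>, used to stop the compiler from folding the helper's frame back into the caller:

#include <linux/compiler.h>	/* noinline_for_stack */
#include <linux/string.h>

static noinline_for_stack int demo_write_stage(int *entries)
{
	char scratch[128];	/* stack cost is paid only inside this helper */

	memset(scratch, 0, sizeof(scratch));
	*entries += 1;
	return 0;
}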
The caller should have done a btrfs_drop_extents so that   * no overlapping inline items exist in the btree   */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, +static int insert_inline_extent(struct btrfs_trans_handle *trans,  				struct btrfs_path *path, int extent_inserted,  				struct btrfs_root *root, struct inode *inode,  				u64 start, size_t size, size_t compressed_size, @@ -2678,6 +2678,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  		trans = NULL;  		goto out_unlock;  	} +  	trans->block_rsv = &root->fs_info->delalloc_block_rsv;  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2947,14 +2948,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,  	root->orphan_block_rsv = NULL;  	spin_unlock(&root->orphan_lock); -	if (root->orphan_item_inserted && +	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&  	    btrfs_root_refs(&root->root_item) > 0) {  		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,  					    root->root_key.objectid);  		if (ret)  			btrfs_abort_transaction(trans, root, ret);  		else -			root->orphan_item_inserted = 0; +			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, +				  &root->state);  	}  	if (block_rsv) { @@ -3271,7 +3273,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		btrfs_block_rsv_release(root, root->orphan_block_rsv,  					(u64)-1); -	if (root->orphan_block_rsv || root->orphan_item_inserted) { +	if (root->orphan_block_rsv || +	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {  		trans = btrfs_join_transaction(root);  		if (!IS_ERR(trans))  			btrfs_end_transaction(trans, root); @@ -3473,7 +3476,7 @@ cache_acl:  		ret = btrfs_load_inode_props(inode, path);  		if (ret)  			btrfs_err(root->fs_info, -				  "error loading props for ino %llu (root %llu): %d\n", +				  "error loading props for ino %llu (root %llu): %d",  				  btrfs_ino(inode),  				  root->root_key.objectid, ret);  	} @@ -3998,7 +4001,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  	 * not block aligned since we will be keeping the last block of the  	 * extent just the way it is.  	 
*/ -	if (root->ref_cows || root == root->fs_info->tree_root) +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || +	    root == root->fs_info->tree_root)  		btrfs_drop_extent_cache(inode, ALIGN(new_size,  					root->sectorsize), (u64)-1, 0); @@ -4091,7 +4095,9 @@ search_again:  							 extent_num_bytes);  				num_dec = (orig_num_bytes -  					   extent_num_bytes); -				if (root->ref_cows && extent_start != 0) +				if (test_bit(BTRFS_ROOT_REF_COWS, +					     &root->state) && +				    extent_start != 0)  					inode_sub_bytes(inode, num_dec);  				btrfs_mark_buffer_dirty(leaf);  			} else { @@ -4105,7 +4111,8 @@ search_again:  				num_dec = btrfs_file_extent_num_bytes(leaf, fi);  				if (extent_start != 0) {  					found_extent = 1; -					if (root->ref_cows) +					if (test_bit(BTRFS_ROOT_REF_COWS, +						     &root->state))  						inode_sub_bytes(inode, num_dec);  				}  			} @@ -4120,10 +4127,9 @@ search_again:  			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {  				u32 size = new_size - found_key.offset; -				if (root->ref_cows) { +				if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))  					inode_sub_bytes(inode, item_end + 1 -  							new_size); -				}  				/*  				 * update the ram bytes to properly reflect @@ -4133,7 +4139,8 @@ search_again:  				size =  				    btrfs_file_extent_calc_inline_size(size);  				btrfs_truncate_item(root, path, size, 1); -			} else if (root->ref_cows) { +			} else if (test_bit(BTRFS_ROOT_REF_COWS, +					    &root->state)) {  				inode_sub_bytes(inode, item_end + 1 -  						found_key.offset);  			} @@ -4155,8 +4162,9 @@ delete:  		} else {  			break;  		} -		if (found_extent && (root->ref_cows || -				     root == root->fs_info->tree_root)) { +		if (found_extent && +		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || +		     root == root->fs_info->tree_root)) {  			btrfs_set_path_blocking(path);  			ret = btrfs_free_extent(trans, root, extent_start,  						extent_num_bytes, 0, @@ -5168,8 +5176,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry)  static void btrfs_dentry_release(struct dentry *dentry)  { -	if (dentry->d_fsdata) -		kfree(dentry->d_fsdata); +	kfree(dentry->d_fsdata);  }  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, @@ -5553,6 +5560,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	struct btrfs_inode_ref *ref;  	struct btrfs_key key[2];  	u32 sizes[2]; +	int nitems = name ? 2 : 1;  	unsigned long ptr;  	int ret; @@ -5572,7 +5580,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	 */  	inode->i_ino = objectid; -	if (dir) { +	if (dir && name) {  		trace_btrfs_inode_request(dir);  		ret = btrfs_set_inode_index(dir, index); @@ -5581,6 +5589,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  			iput(inode);  			return ERR_PTR(ret);  		} +	} else if (dir) { +		*index = 0;  	}  	/*  	 * index_cnt is ignored for everything but a dir, @@ -5605,21 +5615,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);  	key[0].offset = 0; -	/* -	 * Start new inodes with an inode_ref. This is slightly more -	 * efficient for small numbers of hard links since they will -	 * be packed into one item. Extended refs will kick in if we -	 * add more hard links than can fit in the ref item. 
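The inode.c hunks above (and several later ones) replace per-root flags such as ref_cows and orphan_item_inserted with bits in root->state manipulated through the atomic bitops. A reduced sketch of that pattern with made-up names, assuming the usual kernel bitops helpers:

#include <linux/bitops.h>
#include <linux/types.h>

enum { DEMO_ROOT_REF_COWS, DEMO_ROOT_ORPHAN_ITEM_INSERTED };

struct demo_root {
	unsigned long state;	/* replaces a pile of separate int flags */
};

static bool demo_root_tracks_refs(struct demo_root *root)
{
	return test_bit(DEMO_ROOT_REF_COWS, &root->state);
}

static bool demo_orphan_insert_once(struct demo_root *root)
{
	/* atomic test-and-set replaces the old xchg(&root->orphan_item_inserted, 1) */
	return !test_and_set_bit(DEMO_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
}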
-	 */ -	key[1].objectid = objectid; -	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); -	key[1].offset = ref_objectid; -  	sizes[0] = sizeof(struct btrfs_inode_item); -	sizes[1] = name_len + sizeof(*ref); + +	if (name) { +		/* +		 * Start new inodes with an inode_ref. This is slightly more +		 * efficient for small numbers of hard links since they will +		 * be packed into one item. Extended refs will kick in if we +		 * add more hard links than can fit in the ref item. +		 */ +		key[1].objectid = objectid; +		btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); +		key[1].offset = ref_objectid; + +		sizes[1] = name_len + sizeof(*ref); +	}  	path->leave_spinning = 1; -	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); +	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);  	if (ret != 0)  		goto fail; @@ -5632,12 +5645,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  			     sizeof(*inode_item));  	fill_inode_item(trans, path->nodes[0], inode_item, inode); -	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, -			     struct btrfs_inode_ref); -	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); -	btrfs_set_inode_ref_index(path->nodes[0], ref, *index); -	ptr = (unsigned long)(ref + 1); -	write_extent_buffer(path->nodes[0], name, ptr, name_len); +	if (name) { +		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, +				     struct btrfs_inode_ref); +		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); +		btrfs_set_inode_ref_index(path->nodes[0], ref, *index); +		ptr = (unsigned long)(ref + 1); +		write_extent_buffer(path->nodes[0], name, ptr, name_len); +	}  	btrfs_mark_buffer_dirty(path->nodes[0]);  	btrfs_free_path(path); @@ -5673,7 +5688,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	return inode;  fail: -	if (dir) +	if (dir && name)  		BTRFS_I(dir)->index_cnt--;  	btrfs_free_path(path);  	iput(inode); @@ -5958,6 +5973,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  		err = btrfs_update_inode(trans, root, inode);  		if (err)  			goto fail; +		if (inode->i_nlink == 1) { +			/* +			 * If new hard link count is 1, it's a file created +			 * with open(2) O_TMPFILE flag. 
+			 */ +			err = btrfs_orphan_del(trans, inode); +			if (err) +				goto fail; +		}  		d_instantiate(dentry, inode);  		btrfs_log_new_name(trans, inode, NULL, parent);  	} @@ -6086,16 +6110,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,  	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);  	ret = btrfs_decompress(compress_type, tmp, page,  			       extent_offset, inline_size, max_size); -	if (ret) { -		char *kaddr = kmap_atomic(page); -		unsigned long copy_size = min_t(u64, -				  PAGE_CACHE_SIZE - pg_offset, -				  max_size - extent_offset); -		memset(kaddr + pg_offset, 0, copy_size); -		kunmap_atomic(kaddr); -	}  	kfree(tmp); -	return 0; +	return ret;  }  /* @@ -6113,7 +6129,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,  {  	int ret;  	int err = 0; -	u64 bytenr;  	u64 extent_start = 0;  	u64 extent_end = 0;  	u64 objectid = btrfs_ino(inode); @@ -6127,7 +6142,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct btrfs_trans_handle *trans = NULL; -	int compress_type; +	const bool new_inline = !page || create;  again:  	read_lock(&em_tree->lock); @@ -6201,7 +6216,6 @@ again:  	found_type = btrfs_file_extent_type(leaf, item);  	extent_start = found_key.offset; -	compress_type = btrfs_file_extent_compression(leaf, item);  	if (found_type == BTRFS_FILE_EXTENT_REG ||  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {  		extent_end = extent_start + @@ -6236,32 +6250,10 @@ next:  		goto not_found_em;  	} -	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); +	btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); +  	if (found_type == BTRFS_FILE_EXTENT_REG ||  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) { -		em->start = extent_start; -		em->len = extent_end - extent_start; -		em->orig_start = extent_start - -				 btrfs_file_extent_offset(leaf, item); -		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, -								      item); -		bytenr = btrfs_file_extent_disk_bytenr(leaf, item); -		if (bytenr == 0) { -			em->block_start = EXTENT_MAP_HOLE; -			goto insert; -		} -		if (compress_type != BTRFS_COMPRESS_NONE) { -			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); -			em->compress_type = compress_type; -			em->block_start = bytenr; -			em->block_len = em->orig_block_len; -		} else { -			bytenr += btrfs_file_extent_offset(leaf, item); -			em->block_start = bytenr; -			em->block_len = em->len; -			if (found_type == BTRFS_FILE_EXTENT_PREALLOC) -				set_bit(EXTENT_FLAG_PREALLOC, &em->flags); -		}  		goto insert;  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {  		unsigned long ptr; @@ -6270,12 +6262,8 @@ next:  		size_t extent_offset;  		size_t copy_size; -		em->block_start = EXTENT_MAP_INLINE; -		if (!page || create) { -			em->start = extent_start; -			em->len = extent_end - extent_start; +		if (new_inline)  			goto out; -		}  		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);  		extent_offset = page_offset(page) + pg_offset - extent_start; @@ -6285,10 +6273,6 @@ next:  		em->len = ALIGN(copy_size, root->sectorsize);  		em->orig_block_len = em->len;  		em->orig_start = em->start; -		if (compress_type) { -			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); -			em->compress_type = compress_type; -		}  		ptr = btrfs_file_extent_inline_start(item) + extent_offset;  		if (create == 0 && !PageUptodate(page)) {  			if 
(btrfs_file_extent_compression(leaf, item) != @@ -6296,7 +6280,10 @@ next:  				ret = uncompress_inline(path, inode, page,  							pg_offset,  							extent_offset, item); -				BUG_ON(ret); /* -ENOMEM */ +				if (ret) { +					err = ret; +					goto out; +				}  			} else {  				map = kmap(page);  				read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6332,8 +6319,6 @@ next:  		set_extent_uptodate(io_tree, em->start,  				    extent_map_end(em) - 1, NULL, GFP_NOFS);  		goto insert; -	} else { -		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);  	}  not_found:  	em->start = start; @@ -6717,6 +6702,76 @@ out:  	return ret;  } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ +	struct radix_tree_root *root = &inode->i_mapping->page_tree; +	int found = false; +	void **pagep = NULL; +	struct page *page = NULL; +	int start_idx; +	int end_idx; + +	start_idx = start >> PAGE_CACHE_SHIFT; + +	/* +	 * end is the last byte in the last page.  end == start is legal +	 */ +	end_idx = end >> PAGE_CACHE_SHIFT; + +	rcu_read_lock(); + +	/* Most of the code in this while loop is lifted from +	 * find_get_page.  It's been modified to begin searching from a +	 * page and return just the first page found in that range.  If the +	 * found idx is less than or equal to the end idx then we know that +	 * a page exists.  If no pages are found or if those pages are +	 * outside of the range then we're fine (yay!) */ +	while (page == NULL && +	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { +		page = radix_tree_deref_slot(pagep); +		if (unlikely(!page)) +			break; + +		if (radix_tree_exception(page)) { +			if (radix_tree_deref_retry(page)) { +				page = NULL; +				continue; +			} +			/* +			 * Otherwise, shmem/tmpfs must be storing a swap entry +			 * here as an exceptional entry: so return it without +			 * attempting to raise page count. +			 */ +			page = NULL; +			break; /* TODO: Is this relevant for this use case? */ +		} + +		if (!page_cache_get_speculative(page)) { +			page = NULL; +			continue; +		} + +		/* +		 * Has the page moved? +		 * This is part of the lockless pagecache protocol. See +		 * include/linux/pagemap.h for details. +		 */ +		if (unlikely(page != *pagep)) { +			page_cache_release(page); +			page = NULL; +		} +	} + +	if (page) { +		if (page->index <= end_idx) +			found = true; +		page_cache_release(page); +	} + +	rcu_read_unlock(); +	return found; +} +  static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,  			      struct extent_state **cached_state, int writing)  { @@ -6741,10 +6796,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,  		 * invalidate needs to happen so that reads after a write do not  		 * get stale data.  		 */ -		if (!ordered && (!writing || -		    !test_range_bit(&BTRFS_I(inode)->io_tree, -				    lockstart, lockend, EXTENT_UPTODATE, 0, -				    *cached_state))) +		if (!ordered && +		    (!writing || +		     !btrfs_page_exists_in_range(inode, lockstart, lockend)))  			break;  		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, @@ -7126,7 +7180,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)  		 * before atomic variable goto zero, we must make sure  		 * dip->errors is perceived to be set.  		 
*/ -		smp_mb__before_atomic_dec(); +		smp_mb__before_atomic();  	}  	/* if there are more bios still pending for this dio, just exit */ @@ -7306,7 +7360,7 @@ out_err:  	 * before atomic variable goto zero, we must  	 * make sure dip->errors is perceived to be set.  	 */ -	smp_mb__before_atomic_dec(); +	smp_mb__before_atomic();  	if (atomic_dec_and_test(&dip->pending_bios))  		bio_io_error(dip->orig_bio); @@ -7449,7 +7503,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,  		return 0;  	atomic_inc(&inode->i_dio_count); -	smp_mb__after_atomic_inc(); +	smp_mb__after_atomic();  	/*  	 * The generic stuff only does filemap_write_and_wait_range, which @@ -7992,7 +8046,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);  	if (err)  		btrfs_err(new_root->fs_info, -			  "error inheriting subvolume %llu properties: %d\n", +			  "error inheriting subvolume %llu properties: %d",  			  new_root->root_key.objectid, err);  	err = btrfs_update_inode(trans, new_root, inode); @@ -8311,7 +8365,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	BTRFS_I(old_inode)->dir_index = 0ULL;  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {  		/* force full log commit if subvolume involved. */ -		root->fs_info->last_trans_log_full_commit = trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans);  	} else {  		ret = btrfs_insert_inode_ref(trans, dest,  					     new_dentry->d_name.name, @@ -8889,6 +8943,66 @@ static int btrfs_permission(struct inode *inode, int mask)  	return generic_permission(inode, mask);  } +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ +	struct btrfs_trans_handle *trans; +	struct btrfs_root *root = BTRFS_I(dir)->root; +	struct inode *inode = NULL; +	u64 objectid; +	u64 index; +	int ret = 0; + +	/* +	 * 5 units required for adding orphan entry +	 */ +	trans = btrfs_start_transaction(root, 5); +	if (IS_ERR(trans)) +		return PTR_ERR(trans); + +	ret = btrfs_find_free_ino(root, &objectid); +	if (ret) +		goto out; + +	inode = btrfs_new_inode(trans, root, dir, NULL, 0, +				btrfs_ino(dir), objectid, mode, &index); +	if (IS_ERR(inode)) { +		ret = PTR_ERR(inode); +		inode = NULL; +		goto out; +	} + +	ret = btrfs_init_inode_security(trans, inode, dir, NULL); +	if (ret) +		goto out; + +	ret = btrfs_update_inode(trans, root, inode); +	if (ret) +		goto out; + +	inode->i_fop = &btrfs_file_operations; +	inode->i_op = &btrfs_file_inode_operations; + +	inode->i_mapping->a_ops = &btrfs_aops; +	inode->i_mapping->backing_dev_info = &root->fs_info->bdi; +	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + +	ret = btrfs_orphan_add(trans, inode); +	if (ret) +		goto out; + +	d_tmpfile(dentry, inode); +	mark_inode_dirty(inode); + +out: +	btrfs_end_transaction(trans, root); +	if (ret) +		iput(inode); +	btrfs_balance_delayed_items(root); +	btrfs_btree_balance_dirty(root); + +	return ret; +} +  static const struct inode_operations btrfs_dir_inode_operations = {  	.getattr	= btrfs_getattr,  	.lookup		= btrfs_lookup, @@ -8909,6 +9023,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {  	.get_acl	= btrfs_get_acl,  	.set_acl	= btrfs_set_acl,  	.update_time	= btrfs_update_time, +	.tmpfile        = btrfs_tmpfile,  };  static const struct inode_operations btrfs_dir_ro_inode_operations = {  	.lookup		= btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2ad7de94efef..82c18ba12e3f 100644 --- a/fs/btrfs/ioctl.c +++ 
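btrfs_tmpfile() above creates an unlinked inode and immediately puts it on the orphan list; the earlier btrfs_link() hunk then removes the orphan item once such an inode gains a name. A small userspace sketch of that flow (the /mnt/btrfs paths are placeholders, not part of the patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char proc[64];
	/* unnamed file on the filesystem; stays an orphan if we just exit */
	int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	if (write(fd, "data", 4) != 4)
		perror("write");

	/* give it a name; this is the btrfs_link() path that drops the orphan item */
	snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc, AT_FDCWD, "/mnt/btrfs/file",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");

	return close(fd);
}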
b/fs/btrfs/ioctl.c @@ -58,6 +58,7 @@  #include "dev-replace.h"  #include "props.h"  #include "sysfs.h" +#include "qgroup.h"  #ifdef CONFIG_64BIT  /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -638,11 +639,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  	struct btrfs_trans_handle *trans;  	int ret; -	if (!root->ref_cows) +	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		return -EINVAL;  	atomic_inc(&root->will_be_snapshoted); -	smp_mb__after_atomic_inc(); +	smp_mb__after_atomic();  	btrfs_wait_nocow_write(root);  	ret = btrfs_start_delalloc_inodes(root, 0); @@ -711,6 +712,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  	if (ret)  		goto fail; +	/* +	 * If orphan cleanup did remove any orphans, it means the tree was +	 * modified and therefore the commit root is not the same as the +	 * current root anymore. This is a problem, because send uses the +	 * commit root and therefore can see inode items that don't exist +	 * in the current root anymore, and for example make calls to +	 * btrfs_iget, which will do tree lookups based on the current root +	 * and not on the commit root. Those lookups will fail, returning a +	 * -ESTALE error, and making send fail with that error. So make sure +	 * a send does not see any orphans we have just removed, and that it +	 * will see the same inodes regardless of whether a transaction +	 * commit happened before it started (meaning that the commit root +	 * will be the same as the current root) or not. +	 */ +	if (readonly && pending_snapshot->snap->node != +	    pending_snapshot->snap->commit_root) { +		trans = btrfs_join_transaction(pending_snapshot->snap); +		if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { +			ret = PTR_ERR(trans); +			goto fail; +		} +		if (!IS_ERR(trans)) { +			ret = btrfs_commit_transaction(trans, +						       pending_snapshot->snap); +			if (ret) +				goto fail; +		} +	} +  	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);  	if (IS_ERR(inode)) {  		ret = PTR_ERR(inode); @@ -1502,11 +1532,12 @@ static noinline int btrfs_ioctl_resize(struct file *file,  	sizestr = vol_args->name;  	devstr = strchr(sizestr, ':');  	if (devstr) { -		char *end;  		sizestr = devstr + 1;  		*devstr = '\0';  		devstr = vol_args->name; -		devid = simple_strtoull(devstr, &end, 10); +		ret = kstrtoull(devstr, 10, &devid); +		if (ret) +			goto out_free;  		if (!devid) {  			ret = -EINVAL;  			goto out_free; @@ -1562,7 +1593,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,  		new_size = old_size - new_size;  	} else if (mod > 0) {  		if (new_size > ULLONG_MAX - old_size) { -			ret = -EINVAL; +			ret = -ERANGE;  			goto out_free;  		}  		new_size = old_size + new_size; @@ -2219,6 +2250,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  	struct btrfs_ioctl_vol_args *vol_args;  	struct btrfs_trans_handle *trans;  	struct btrfs_block_rsv block_rsv; +	u64 root_flags;  	u64 qgroup_reserved;  	int namelen;  	int ret; @@ -2240,6 +2272,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  	if (err)  		goto out; +  	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);  	if (err == -EINTR)  		goto out_drop_write; @@ -2301,6 +2334,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  	}  	mutex_lock(&inode->i_mutex); + +	/* +	 * Don't allow to delete a subvolume with send in progress. 
This is +	 * inside the i_mutex so the error handling that has to drop the bit +	 * again is not run concurrently. +	 */ +	spin_lock(&dest->root_item_lock); +	root_flags = btrfs_root_flags(&dest->root_item); +	if (dest->send_in_progress == 0) { +		btrfs_set_root_flags(&dest->root_item, +				root_flags | BTRFS_ROOT_SUBVOL_DEAD); +		spin_unlock(&dest->root_item_lock); +	} else { +		spin_unlock(&dest->root_item_lock); +		btrfs_warn(root->fs_info, +			"Attempt to delete subvolume %llu during send", +			dest->root_key.objectid); +		err = -EPERM; +		goto out_dput; +	} +  	err = d_invalidate(dentry);  	if (err)  		goto out_unlock; @@ -2346,7 +2400,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  	dest->root_item.drop_level = 0;  	btrfs_set_root_refs(&dest->root_item, 0); -	if (!xchg(&dest->orphan_item_inserted, 1)) { +	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {  		ret = btrfs_insert_orphan_item(trans,  					root->fs_info->tree_root,  					dest->root_key.objectid); @@ -2389,11 +2443,19 @@ out_release:  out_up_write:  	up_write(&root->fs_info->subvol_sem);  out_unlock: +	if (err) { +		spin_lock(&dest->root_item_lock); +		root_flags = btrfs_root_flags(&dest->root_item); +		btrfs_set_root_flags(&dest->root_item, +				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); +		spin_unlock(&dest->root_item_lock); +	}  	mutex_unlock(&inode->i_mutex);  	if (!err) {  		shrink_dcache_sb(root->fs_info->sb);  		btrfs_invalidate_inodes(dest);  		d_delete(dentry); +		ASSERT(dest->send_in_progress == 0);  		/* the last ref */  		if (dest->cache_inode) { @@ -2557,9 +2619,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)  	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;  	int ret = 0; -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; -  	fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);  	if (!fi_args)  		return -ENOMEM; @@ -2574,6 +2633,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)  	}  	mutex_unlock(&fs_devices->device_list_mutex); +	fi_args->nodesize = root->fs_info->super_copy->nodesize; +	fi_args->sectorsize = root->fs_info->super_copy->sectorsize; +	fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; +  	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))  		ret = -EFAULT; @@ -2589,9 +2652,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)  	int ret = 0;  	char *s_uuid = NULL; -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; -  	di_args = memdup_user(arg, sizeof(*di_args));  	if (IS_ERR(di_args))  		return PTR_ERR(di_args); @@ -2669,10 +2729,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)  		lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);  		ordered = btrfs_lookup_first_ordered_extent(inode,  							    off + len - 1); -		if (!ordered && +		if ((!ordered || +		     ordered->file_offset + ordered->len <= off || +		     ordered->file_offset >= off + len) &&  		    !test_range_bit(&BTRFS_I(inode)->io_tree, off, -				    off + len - 1, EXTENT_DELALLOC, 0, NULL)) +				    off + len - 1, EXTENT_DELALLOC, 0, NULL)) { +			if (ordered) +				btrfs_put_ordered_extent(ordered);  			break; +		}  		unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);  		if (ordered)  			btrfs_put_ordered_extent(ordered); @@ -2912,6 +2977,126 @@ out:  	return ret;  } +/* Helper to check and see if this root currently has a ref on the given disk + * bytenr.  If it does then we need to update the quota for this root.  
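The btrfs_ioctl_fs_info() hunk above starts reporting nodesize, sectorsize and clone_alignment and no longer requires CAP_SYS_ADMIN. A userspace sketch of reading them, assuming a uapi <linux/btrfs.h> new enough to declare the added fields ("/mnt/btrfs" is again only a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_fs_info_args fi = { 0 };
	int fd = open(argc > 1 ? argv[1] : "/mnt/btrfs", O_RDONLY);

	if (fd < 0 || ioctl(fd, BTRFS_IOC_FS_INFO, &fi) < 0) {
		perror("BTRFS_IOC_FS_INFO");
		return 1;
	}
	printf("devices=%llu nodesize=%u sectorsize=%u clone_alignment=%u\n",
	       (unsigned long long)fi.num_devices, fi.nodesize,
	       fi.sectorsize, fi.clone_alignment);
	close(fd);
	return 0;
}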
This + * doesn't do anything if quotas aren't enabled. + */ +static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, +		     u64 disko) +{ +	struct seq_list tree_mod_seq_elem = {}; +	struct ulist *roots; +	struct ulist_iterator uiter; +	struct ulist_node *root_node = NULL; +	int ret; + +	if (!root->fs_info->quota_enabled) +		return 1; + +	btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); +	ret = btrfs_find_all_roots(trans, root->fs_info, disko, +				   tree_mod_seq_elem.seq, &roots); +	if (ret < 0) +		goto out; +	ret = 0; +	ULIST_ITER_INIT(&uiter); +	while ((root_node = ulist_next(roots, &uiter))) { +		if (root_node->val == root->objectid) { +			ret = 1; +			break; +		} +	} +	ulist_free(roots); +out: +	btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); +	return ret; +} + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, +				     struct inode *inode, +				     u64 endoff, +				     const u64 destoff, +				     const u64 olen) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	int ret; + +	inode_inc_iversion(inode); +	inode->i_mtime = inode->i_ctime = CURRENT_TIME; +	/* +	 * We round up to the block size at eof when determining which +	 * extents to clone above, but shouldn't round up the file size. +	 */ +	if (endoff > destoff + olen) +		endoff = destoff + olen; +	if (endoff > inode->i_size) +		btrfs_i_size_write(inode, endoff); + +	ret = btrfs_update_inode(trans, root, inode); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		btrfs_end_transaction(trans, root); +		goto out; +	} +	ret = btrfs_end_transaction(trans, root); +out: +	return ret; +} + +static void clone_update_extent_map(struct inode *inode, +				    const struct btrfs_trans_handle *trans, +				    const struct btrfs_path *path, +				    struct btrfs_file_extent_item *fi, +				    const u64 hole_offset, +				    const u64 hole_len) +{ +	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +	struct extent_map *em; +	int ret; + +	em = alloc_extent_map(); +	if (!em) { +		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +			&BTRFS_I(inode)->runtime_flags); +		return; +	} + +	if (fi) { +		btrfs_extent_item_to_extent_map(inode, path, fi, false, em); +		em->generation = -1; +		if (btrfs_file_extent_type(path->nodes[0], fi) == +		    BTRFS_FILE_EXTENT_INLINE) +			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +				&BTRFS_I(inode)->runtime_flags); +	} else { +		em->start = hole_offset; +		em->len = hole_len; +		em->ram_bytes = em->len; +		em->orig_start = hole_offset; +		em->block_start = EXTENT_MAP_HOLE; +		em->block_len = 0; +		em->orig_block_len = 0; +		em->compress_type = BTRFS_COMPRESS_NONE; +		em->generation = trans->transid; +	} + +	while (1) { +		write_lock(&em_tree->lock); +		ret = add_extent_mapping(em_tree, em, 1); +		write_unlock(&em_tree->lock); +		if (ret != -EEXIST) { +			free_extent_map(em); +			break; +		} +		btrfs_drop_extent_cache(inode, em->start, +					em->start + em->len - 1, 0); +	} + +	if (unlikely(ret)) +		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +			&BTRFS_I(inode)->runtime_flags); +} +  /**   * btrfs_clone() - clone a range from inode file to another   * @@ -2924,7 +3109,8 @@ out:   * @destoff: Offset within @inode to start clone   */  static int btrfs_clone(struct inode *src, struct inode *inode, -		       u64 off, u64 olen, u64 olen_aligned, u64 destoff) +		       const u64 off, const u64 olen, const u64 olen_aligned, +		       const u64 destoff)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_path *path = NULL; @@ -2935,7 
+3121,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode,  	u32 nritems;  	int slot;  	int ret; -	u64 len = olen_aligned; +	int no_quota; +	const u64 len = olen_aligned; +	u64 last_disko = 0; +	u64 last_dest_end = destoff;  	ret = -ENOMEM;  	buf = vmalloc(btrfs_level_size(root, 0)); @@ -2952,7 +3141,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,  	/* clone data */  	key.objectid = btrfs_ino(src);  	key.type = BTRFS_EXTENT_DATA_KEY; -	key.offset = 0; +	key.offset = off;  	while (1) {  		/* @@ -2964,9 +3153,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,  				0, 0);  		if (ret < 0)  			goto out; +		/* +		 * First search, if no extent item that starts at offset off was +		 * found but the previous item is an extent item, it's possible +		 * it might overlap our target range, therefore process it. +		 */ +		if (key.offset == off && ret > 0 && path->slots[0] > 0) { +			btrfs_item_key_to_cpu(path->nodes[0], &key, +					      path->slots[0] - 1); +			if (key.type == BTRFS_EXTENT_DATA_KEY) +				path->slots[0]--; +		}  		nritems = btrfs_header_nritems(path->nodes[0]);  process_slot: +		no_quota = 1;  		if (path->slots[0] >= nritems) {  			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);  			if (ret < 0) @@ -2991,7 +3192,7 @@ process_slot:  			u64 disko = 0, diskl = 0;  			u64 datao = 0, datal = 0;  			u8 comp; -			u64 endoff; +			u64 drop_start;  			extent = btrfs_item_ptr(leaf, slot,  						struct btrfs_file_extent_item); @@ -3012,10 +3213,16 @@ process_slot:  								    extent);  			} -			if (key.offset + datal <= off || -			    key.offset >= off + len - 1) { +			/* +			 * The first search might have left us at an extent +			 * item that ends before our target range's start, can +			 * happen if we have holes and NO_HOLES feature enabled. +			 */ +			if (key.offset + datal <= off) {  				path->slots[0]++;  				goto process_slot; +			} else if (key.offset >= off + len) { +				break;  			}  			size = btrfs_item_size_nr(leaf, slot); @@ -3034,6 +3241,18 @@ process_slot:  				new_key.offset = destoff;  			/* +			 * Deal with a hole that doesn't have an extent item +			 * that represents it (NO_HOLES feature enabled). +			 * This hole is either in the middle of the cloning +			 * range or at the beginning (fully overlaps it or +			 * partially overlaps it). +			 */ +			if (new_key.offset != last_dest_end) +				drop_start = last_dest_end; +			else +				drop_start = new_key.offset; + +			/*  			 * 1 - adjusting old extent (we may have to split it)  			 * 1 - add new extent  			 * 1 - inode update @@ -3051,18 +3270,18 @@ process_slot:  				 * | ------------- extent ------------- |  				 */ -				/* substract range b */ +				/* subtract range b */  				if (key.offset + datal > off + len)  					datal = off + len - key.offset; -				/* substract range a */ +				/* subtract range a */  				if (off > key.offset) {  					datao += off - key.offset;  					datal -= off - key.offset;  				}  				ret = btrfs_drop_extents(trans, root, inode, -							 new_key.offset, +							 drop_start,  							 new_key.offset + datal,  							 1);  				if (ret) { @@ -3099,6 +3318,28 @@ process_slot:  							     datao);  				btrfs_set_file_extent_num_bytes(leaf, extent,  								datal); + +				/* +				 * We need to look up the roots that point at +				 * this bytenr and see if the new root does.  If +				 * it does not we need to make sure we update +				 * quotas appropriately. 
+				 */ +				if (disko && root != BTRFS_I(src)->root && +				    disko != last_disko) { +					no_quota = check_ref(trans, root, +							     disko); +					if (no_quota < 0) { +						btrfs_abort_transaction(trans, +									root, +									ret); +						btrfs_end_transaction(trans, +								      root); +						ret = no_quota; +						goto out; +					} +				} +  				if (disko) {  					inode_add_bytes(inode, datal);  					ret = btrfs_inc_extent_ref(trans, root, @@ -3106,7 +3347,7 @@ process_slot:  							root->root_key.objectid,  							btrfs_ino(inode),  							new_key.offset - datao, -							0); +							no_quota);  					if (ret) {  						btrfs_abort_transaction(trans,  									root, @@ -3120,6 +3361,8 @@ process_slot:  			} else if (type == BTRFS_FILE_EXTENT_INLINE) {  				u64 skip = 0;  				u64 trim = 0; +				u64 aligned_end = 0; +  				if (off > key.offset) {  					skip = off - key.offset;  					new_key.offset += skip; @@ -3136,9 +3379,11 @@ process_slot:  				size -= skip + trim;  				datal -= skip + trim; +				aligned_end = ALIGN(new_key.offset + datal, +						    root->sectorsize);  				ret = btrfs_drop_extents(trans, root, inode, -							 new_key.offset, -							 new_key.offset + datal, +							 drop_start, +							 aligned_end,  							 1);  				if (ret) {  					if (ret != -EOPNOTSUPP) @@ -3170,40 +3415,69 @@ process_slot:  					    btrfs_item_ptr_offset(leaf, slot),  					    size);  				inode_add_bytes(inode, datal); +				extent = btrfs_item_ptr(leaf, slot, +						struct btrfs_file_extent_item);  			} +			/* If we have an implicit hole (NO_HOLES feature). */ +			if (drop_start < new_key.offset) +				clone_update_extent_map(inode, trans, +						path, NULL, drop_start, +						new_key.offset - drop_start); + +			clone_update_extent_map(inode, trans, path, +						extent, 0, 0); +  			btrfs_mark_buffer_dirty(leaf);  			btrfs_release_path(path); -			inode_inc_iversion(inode); -			inode->i_mtime = inode->i_ctime = CURRENT_TIME; - -			/* -			 * we round up to the block size at eof when -			 * determining which extents to clone above, -			 * but shouldn't round up the file size -			 */ -			endoff = new_key.offset + datal; -			if (endoff > destoff+olen) -				endoff = destoff+olen; -			if (endoff > inode->i_size) -				btrfs_i_size_write(inode, endoff); - -			ret = btrfs_update_inode(trans, root, inode); -			if (ret) { -				btrfs_abort_transaction(trans, root, ret); -				btrfs_end_transaction(trans, root); +			last_dest_end = new_key.offset + datal; +			ret = clone_finish_inode_update(trans, inode, +							last_dest_end, +							destoff, olen); +			if (ret)  				goto out; -			} -			ret = btrfs_end_transaction(trans, root); +			if (new_key.offset + datal >= destoff + len) +				break;  		}  		btrfs_release_path(path);  		key.offset++;  	}  	ret = 0; +	if (last_dest_end < destoff + len) { +		/* +		 * We have an implicit hole (NO_HOLES feature is enabled) that +		 * fully or partially overlaps our cloning range at its end. 
+		 */ +		btrfs_release_path(path); + +		/* +		 * 1 - remove extent(s) +		 * 1 - inode update +		 */ +		trans = btrfs_start_transaction(root, 2); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			goto out; +		} +		ret = btrfs_drop_extents(trans, root, inode, +					 last_dest_end, destoff + len, 1); +		if (ret) { +			if (ret != -EOPNOTSUPP) +				btrfs_abort_transaction(trans, root, ret); +			btrfs_end_transaction(trans, root); +			goto out; +		} +		ret = clone_finish_inode_update(trans, inode, destoff + len, +						destoff, olen); +		if (ret) +			goto out; +		clone_update_extent_map(inode, trans, path, NULL, last_dest_end, +					destoff + len - last_dest_end); +	} +  out: -	btrfs_release_path(path);  	btrfs_free_path(path);  	vfree(buf);  	return ret; @@ -3315,15 +3589,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,  			goto out_unlock;  	} -	/* truncate page cache pages from target inode range */ -	truncate_inode_pages_range(&inode->i_data, destoff, -				   PAGE_CACHE_ALIGN(destoff + len) - 1); +	/* +	 * Lock the target range too. Right after we replace the file extent +	 * items in the fs tree (which now point to the cloned data), we might +	 * have a worker replace them with extent items relative to a write +	 * operation that was issued before this clone operation (i.e. confront +	 * with inode.c:btrfs_finish_ordered_io). +	 */ +	if (same_inode) { +		u64 lock_start = min_t(u64, off, destoff); +		u64 lock_len = max_t(u64, off, destoff) + len - lock_start; -	lock_extent_range(src, off, len); +		lock_extent_range(src, lock_start, lock_len); +	} else { +		lock_extent_range(src, off, len); +		lock_extent_range(inode, destoff, len); +	}  	ret = btrfs_clone(src, inode, off, olen, len, destoff); -	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); +	if (same_inode) { +		u64 lock_start = min_t(u64, off, destoff); +		u64 lock_end = max_t(u64, off, destoff) + len - 1; + +		unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); +	} else { +		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); +		unlock_extent(&BTRFS_I(inode)->io_tree, destoff, +			      destoff + len - 1); +	} +	/* +	 * Truncate page cache pages so that future reads will see the cloned +	 * data immediately and not the previous data. 
+	 */ +	truncate_inode_pages_range(&inode->i_data, destoff, +				   PAGE_CACHE_ALIGN(destoff + len) - 1);  out_unlock:  	if (!same_inode) {  		if (inode < src) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b47f669aca75..dfad8514f0da 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws,  		if (ret != LZO_E_OK) {  			printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",  			       ret); -			ret = -1; +			ret = -EIO;  			goto out;  		} @@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws,  				kunmap(out_page);  				if (nr_pages == nr_dest_pages) {  					out_page = NULL; -					ret = -1; +					ret = -E2BIG;  					goto out;  				} @@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws,  		/* we're making it bigger, give up */  		if (tot_in > 8192 && tot_in < tot_out) { -			ret = -1; +			ret = -E2BIG;  			goto out;  		} @@ -335,7 +335,7 @@ cont:  					break;  				if (page_in_index + 1 >= total_pages_in) { -					ret = -1; +					ret = -EIO;  					goto done;  				} @@ -358,7 +358,7 @@ cont:  			kunmap(pages_in[page_in_index - 1]);  		if (ret != LZO_E_OK) {  			printk(KERN_WARNING "BTRFS: decompress failed\n"); -			ret = -1; +			ret = -EIO;  			break;  		} @@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,  	ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);  	if (ret != LZO_E_OK) {  		printk(KERN_WARNING "BTRFS: decompress failed!\n"); -		ret = -1; +		ret = -EIO;  		goto out;  	}  	if (out_len < start_byte) { -		ret = -1; +		ret = -EIO;  		goto out;  	} diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a94b05f72869..e12441c7cf1d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno,  {  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " -		    "%llu\n", offset); +		    "%llu", offset);  }  /* diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2cf905877aaf..cf5aead95a7f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -32,6 +32,7 @@  #include "ulist.h"  #include "backref.h"  #include "extent_io.h" +#include "qgroup.h"  /* TODO XXX FIXME   *  - subvol delete -> delete when ref goes to 0? delete limits also? 
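The ioctl.c changes above rework how btrfs_ioctl_clone() locks the source and destination ranges and when it drops the destination's page cache. From userspace the operation is still driven by BTRFS_IOC_CLONE_RANGE; a minimal sketch follows (file names come from argv, and a source length of 0 means "clone to EOF"):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_clone_range_args args = { 0 };
	int src, dst;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}
	src = open(argv[1], O_RDONLY);
	dst = open(argv[2], O_WRONLY | O_CREAT, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	args.src_fd = src;	/* whole-file reflink: offsets 0, length 0 */
	if (ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) < 0) {
		perror("BTRFS_IOC_CLONE_RANGE");
		return 1;
	}
	close(src);
	return close(dst);
}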
@@ -84,8 +85,8 @@ struct btrfs_qgroup {  	/*  	 * temp variables for accounting operations  	 */ -	u64 tag; -	u64 refcnt; +	u64 old_refcnt; +	u64 new_refcnt;  };  /* @@ -98,6 +99,9 @@ struct btrfs_qgroup_list {  	struct btrfs_qgroup *member;  }; +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) +  static int  qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,  		   int init_flags); @@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,  	return -ENOENT;  } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, +			       u64 rfer, u64 excl) +{ +	struct btrfs_qgroup *qgroup; + +	qgroup = find_qgroup_rb(fs_info, qgroupid); +	if (!qgroup) +		return -EINVAL; +	if (qgroup->rfer != rfer || qgroup->excl != excl) +		return -EINVAL; +	return 0; +} +#endif +  /*   * The full config is read in one go, only called from open_ctree()   * It doesn't use any locking, as at this point we're still single-threaded @@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,  	struct extent_buffer *leaf;  	struct btrfs_key key; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) +		return 0; +#endif  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,  	int ret;  	int slot; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	key.objectid = 0;  	key.type = BTRFS_QGROUP_INFO_KEY;  	key.offset = qgroup->qgroupid; @@ -1174,33 +1201,198 @@ out:  	mutex_unlock(&fs_info->qgroup_ioctl_lock);  	return ret;  } +static int comp_oper(struct btrfs_qgroup_operation *oper1, +		     struct btrfs_qgroup_operation *oper2) +{ +	if (oper1->bytenr < oper2->bytenr) +		return -1; +	if (oper1->bytenr > oper2->bytenr) +		return 1; +	if (oper1->seq < oper2->seq) +		return -1; +	if (oper1->seq > oper2->seq) +		return -1; +	if (oper1->ref_root < oper2->ref_root) +		return -1; +	if (oper1->ref_root > oper2->ref_root) +		return 1; +	if (oper1->type < oper2->type) +		return -1; +	if (oper1->type > oper2->type) +		return 1; +	return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, +			      struct btrfs_qgroup_operation *oper) +{ +	struct rb_node **p; +	struct rb_node *parent = NULL; +	struct btrfs_qgroup_operation *cur; +	int cmp; + +	spin_lock(&fs_info->qgroup_op_lock); +	p = &fs_info->qgroup_op_tree.rb_node; +	while (*p) { +		parent = *p; +		cur = rb_entry(parent, struct btrfs_qgroup_operation, n); +		cmp = comp_oper(cur, oper); +		if (cmp < 0) { +			p = &(*p)->rb_right; +		} else if (cmp) { +			p = &(*p)->rb_left; +		} else { +			spin_unlock(&fs_info->qgroup_op_lock); +			return -EEXIST; +		} +	} +	rb_link_node(&oper->n, parent, p); +	rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); +	spin_unlock(&fs_info->qgroup_op_lock); +	return 0; +}  /* - * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts - * the modification into a list that's later used by btrfs_end_transaction to - * pass the recorded modifications on to btrfs_qgroup_account_ref. + * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. 
+ * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. + * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point.  If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK.   */  int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, -			    struct btrfs_delayed_ref_node *node, -			    struct btrfs_delayed_extent_op *extent_op) +			    struct btrfs_fs_info *fs_info, u64 ref_root, +			    u64 bytenr, u64 num_bytes, +			    enum btrfs_qgroup_operation_type type, int mod_seq)  { -	struct qgroup_update *u; +	struct btrfs_qgroup_operation *oper; +	int ret; + +	if (!is_fstree(ref_root) || !fs_info->quota_enabled) +		return 0; -	BUG_ON(!trans->delayed_ref_elem.seq); -	u = kmalloc(sizeof(*u), GFP_NOFS); -	if (!u) +	oper = kmalloc(sizeof(*oper), GFP_NOFS); +	if (!oper)  		return -ENOMEM; -	u->node = node; -	u->extent_op = extent_op; -	list_add_tail(&u->list, &trans->qgroup_ref_list); +	oper->ref_root = ref_root; +	oper->bytenr = bytenr; +	oper->num_bytes = num_bytes; +	oper->type = type; +	oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); +	INIT_LIST_HEAD(&oper->elem.list); +	oper->elem.seq = 0; +	ret = insert_qgroup_oper(fs_info, oper); +	if (ret) { +		/* Shouldn't happen so have an assert for developers */ +		ASSERT(0); +		kfree(oper); +		return ret; +	} +	list_add_tail(&oper->list, &trans->qgroup_ref_list); + +	if (mod_seq) +		btrfs_get_tree_mod_seq(fs_info, &oper->elem);  	return 0;  } -static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, -				    struct ulist *roots, struct ulist *tmp, -				    u64 seq) +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. 
+ */ +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, +				  struct btrfs_qgroup_operation *oper) +{ +	struct btrfs_qgroup *qgroup; +	struct ulist *tmp; +	struct btrfs_qgroup_list *glist; +	struct ulist_node *unode; +	struct ulist_iterator uiter; +	int sign = 0; +	int ret = 0; + +	tmp = ulist_alloc(GFP_NOFS); +	if (!tmp) +		return -ENOMEM; + +	spin_lock(&fs_info->qgroup_lock); +	if (!fs_info->quota_root) +		goto out; +	qgroup = find_qgroup_rb(fs_info, oper->ref_root); +	if (!qgroup) +		goto out; +	switch (oper->type) { +	case BTRFS_QGROUP_OPER_ADD_EXCL: +		sign = 1; +		break; +	case BTRFS_QGROUP_OPER_SUB_EXCL: +		sign = -1; +		break; +	default: +		ASSERT(0); +	} +	qgroup->rfer += sign * oper->num_bytes; +	qgroup->rfer_cmpr += sign * oper->num_bytes; + +	WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); +	qgroup->excl += sign * oper->num_bytes; +	qgroup->excl_cmpr += sign * oper->num_bytes; + +	qgroup_dirty(fs_info, qgroup); + +	/* Get all of the parent groups that contain this qgroup */ +	list_for_each_entry(glist, &qgroup->groups, next_group) { +		ret = ulist_add(tmp, glist->group->qgroupid, +				ptr_to_u64(glist->group), GFP_ATOMIC); +		if (ret < 0) +			goto out; +	} + +	/* Iterate all of the parents and adjust their reference counts */ +	ULIST_ITER_INIT(&uiter); +	while ((unode = ulist_next(tmp, &uiter))) { +		qgroup = u64_to_ptr(unode->aux); +		qgroup->rfer += sign * oper->num_bytes; +		qgroup->rfer_cmpr += sign * oper->num_bytes; +		qgroup->excl += sign * oper->num_bytes; +		if (sign < 0) +			WARN_ON(qgroup->excl < oper->num_bytes); +		qgroup->excl_cmpr += sign * oper->num_bytes; +		qgroup_dirty(fs_info, qgroup); + +		/* Add any parents of the parents */ +		list_for_each_entry(glist, &qgroup->groups, next_group) { +			ret = ulist_add(tmp, glist->group->qgroupid, +					ptr_to_u64(glist->group), GFP_ATOMIC); +			if (ret < 0) +				goto out; +		} +	} +	ret = 0; +out: +	spin_unlock(&fs_info->qgroup_lock); +	ulist_free(tmp); +	return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, +				  u64 root_to_skip, struct ulist *tmp, +				  struct ulist *roots, struct ulist *qgroups, +				  u64 seq, int *old_roots, int rescan)  {  	struct ulist_node *unode;  	struct ulist_iterator uiter; @@ -1211,256 +1403,549 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,  	ULIST_ITER_INIT(&uiter);  	while ((unode = ulist_next(roots, &uiter))) { +		/* We don't count our current root here */ +		if (unode->val == root_to_skip) +			continue;  		qg = find_qgroup_rb(fs_info, unode->val);  		if (!qg)  			continue; +		/* +		 * We could have a pending removal of this same ref so we may +		 * not have actually found our ref root when doing +		 * btrfs_find_all_roots, so we need to keep track of how many +		 * old roots we find in case we removed ours and added a +		 * different one at the same time.  I don't think this could +		 * happen in practice but that sort of thinking leads to pain +		 * and suffering and to the dark side. 
+		 */ +		(*old_roots)++;  		ulist_reinit(tmp); -						/* XXX id not needed */ -		ret = ulist_add(tmp, qg->qgroupid, -				(u64)(uintptr_t)qg, GFP_ATOMIC); +		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), +				GFP_ATOMIC); +		if (ret < 0) +			return ret; +		ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);  		if (ret < 0)  			return ret;  		ULIST_ITER_INIT(&tmp_uiter);  		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {  			struct btrfs_qgroup_list *glist; -			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; -			if (qg->refcnt < seq) -				qg->refcnt = seq + 1; +			qg = u64_to_ptr(tmp_unode->aux); +			/* +			 * We use this sequence number to keep from having to +			 * run the whole list and 0 out the refcnt every time. +			 * We basically use sequnce as the known 0 count and +			 * then add 1 everytime we see a qgroup.  This is how we +			 * get how many of the roots actually point up to the +			 * upper level qgroups in order to determine exclusive +			 * counts. +			 * +			 * For rescan we want to set old_refcnt to seq so our +			 * exclusive calculations end up correct. +			 */ +			if (rescan) +				qg->old_refcnt = seq; +			else if (qg->old_refcnt < seq) +				qg->old_refcnt = seq + 1;  			else -				++qg->refcnt; +				qg->old_refcnt++; +			if (qg->new_refcnt < seq) +				qg->new_refcnt = seq + 1; +			else +				qg->new_refcnt++;  			list_for_each_entry(glist, &qg->groups, next_group) { +				ret = ulist_add(qgroups, glist->group->qgroupid, +						ptr_to_u64(glist->group), +						GFP_ATOMIC); +				if (ret < 0) +					return ret;  				ret = ulist_add(tmp, glist->group->qgroupid, -						(u64)(uintptr_t)glist->group, +						ptr_to_u64(glist->group),  						GFP_ATOMIC);  				if (ret < 0)  					return ret;  			}  		}  	} +	return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, +				       struct btrfs_qgroup_operation *oper, +				       struct ulist *tmp, +				       struct ulist *qgroups, u64 seq, +				       int *old_roots) +{ +	struct ulist_node *unode; +	struct ulist_iterator uiter; +	struct btrfs_qgroup *qg; +	struct btrfs_qgroup_operation *tmp_oper; +	struct rb_node *n; +	int ret; + +	ulist_reinit(tmp); +	/* +	 * We only walk forward in the tree since we're only interested in +	 * removals that happened _after_  our operation. +	 */ +	spin_lock(&fs_info->qgroup_op_lock); +	n = rb_next(&oper->n); +	spin_unlock(&fs_info->qgroup_op_lock); +	if (!n) +		return 0; +	tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); +	while (tmp_oper->bytenr == oper->bytenr) { +		/* +		 * If it's not a removal we don't care, additions work out +		 * properly with our refcnt tracking. +		 */ +		if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && +		    tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) +			goto next; +		qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); +		if (!qg) +			goto next; +		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), +				GFP_ATOMIC); +		if (ret) { +			if (ret < 0) +				return ret; +			/* +			 * We only want to increase old_roots if this qgroup is +			 * not already in the list of qgroups.  If it is already +			 * there then that means it must have been re-added or +			 * the delete will be discarded because we had an +			 * existing ref that we haven't looked up yet.  In this +			 * case we don't want to increase old_roots.  
So if ret +			 * == 1 then we know that this is the first time we've +			 * seen this qgroup and we can bump the old_roots. +			 */ +			(*old_roots)++; +			ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), +					GFP_ATOMIC); +			if (ret < 0) +				return ret; +		} +next: +		spin_lock(&fs_info->qgroup_op_lock); +		n = rb_next(&tmp_oper->n); +		spin_unlock(&fs_info->qgroup_op_lock); +		if (!n) +			break; +		tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); +	} + +	/* Ok now process the qgroups we found */ +	ULIST_ITER_INIT(&uiter); +	while ((unode = ulist_next(tmp, &uiter))) { +		struct btrfs_qgroup_list *glist; + +		qg = u64_to_ptr(unode->aux); +		if (qg->old_refcnt < seq) +			qg->old_refcnt = seq + 1; +		else +			qg->old_refcnt++; +		if (qg->new_refcnt < seq) +			qg->new_refcnt = seq + 1; +		else +			qg->new_refcnt++; +		list_for_each_entry(glist, &qg->groups, next_group) { +			ret = ulist_add(qgroups, glist->group->qgroupid, +					ptr_to_u64(glist->group), GFP_ATOMIC); +			if (ret < 0) +				return ret; +			ret = ulist_add(tmp, glist->group->qgroupid, +					ptr_to_u64(glist->group), GFP_ATOMIC); +			if (ret < 0) +				return ret; +		} +	}  	return 0;  } -static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, -				    struct ulist *roots, struct ulist *tmp, -				    u64 seq, int sgn, u64 num_bytes, -				    struct btrfs_qgroup *qgroup) +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, +				  struct btrfs_qgroup_operation *oper, +				  struct btrfs_qgroup *qgroup, +				  struct ulist *tmp, struct ulist *qgroups, +				  u64 seq)  {  	struct ulist_node *unode;  	struct ulist_iterator uiter;  	struct btrfs_qgroup *qg; -	struct btrfs_qgroup_list *glist;  	int ret;  	ulist_reinit(tmp); -	ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); +	ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), +			GFP_ATOMIC); +	if (ret < 0) +		return ret; +	ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), +			GFP_ATOMIC);  	if (ret < 0)  		return ret; -  	ULIST_ITER_INIT(&uiter);  	while ((unode = ulist_next(tmp, &uiter))) { -		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; -		if (qg->refcnt < seq) { -			/* not visited by step 1 */ -			qg->rfer += sgn * num_bytes; -			qg->rfer_cmpr += sgn * num_bytes; -			if (roots->nnodes == 0) { -				qg->excl += sgn * num_bytes; -				qg->excl_cmpr += sgn * num_bytes; -			} -			qgroup_dirty(fs_info, qg); -		} -		WARN_ON(qg->tag >= seq); -		qg->tag = seq; +		struct btrfs_qgroup_list *glist; +		qg = u64_to_ptr(unode->aux); +		if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { +			if (qg->new_refcnt < seq) +				qg->new_refcnt = seq + 1; +			else +				qg->new_refcnt++; +		} else { +			if (qg->old_refcnt < seq) +				qg->old_refcnt = seq + 1; +			else +				qg->old_refcnt++; +		}  		list_for_each_entry(glist, &qg->groups, next_group) {  			ret = ulist_add(tmp, glist->group->qgroupid, -					(uintptr_t)glist->group, GFP_ATOMIC); +					ptr_to_u64(glist->group), GFP_ATOMIC); +			if (ret < 0) +				return ret; +			ret = ulist_add(qgroups, glist->group->qgroupid, +					ptr_to_u64(glist->group), GFP_ATOMIC);  			if (ret < 0)  				return ret;  		}  	} -  	return 0;  } -static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, -				    struct ulist *roots, struct ulist *tmp, -				    u64 seq, int sgn, u64 num_bytes) +/* + * This adjusts the counters for all referenced qgroups if need be. 
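The ret == 1 test in the deleted-refs walk above relies on ulist_add()'s return convention: 0 when the id was already present, 1 when it was just inserted, negative on error. That is what lets old_roots be counted only the first time a given ref root shows up, no matter how many later operations mention it. A toy stand-in for the pattern:

#include <assert.h>
#include <stddef.h>

/* Toy ulist_add(): 1 if val was inserted, 0 if it was already present
 * (a negative return would correspond to an allocation error). */
static int set_add(unsigned long long *set, size_t *n, unsigned long long val)
{
	for (size_t i = 0; i < *n; i++)
		if (set[i] == val)
			return 0;
	set[(*n)++] = val;
	return 1;
}

int main(void)
{
	unsigned long long qgroups[8];
	size_t n = 0;
	int old_roots = 0;

	/* later removals queued by roots 258, 259 and 258 again */
	unsigned long long ops[] = { 258, 259, 258 };

	for (size_t i = 0; i < 3; i++)
		if (set_add(qgroups, &n, ops[i]) == 1)
			old_roots++;	/* bumped only on first sight */

	assert(old_roots == 2);
	return 0;
}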
+ */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, +				  u64 root_to_skip, u64 num_bytes, +				  struct ulist *qgroups, u64 seq, +				  int old_roots, int new_roots, int rescan)  {  	struct ulist_node *unode;  	struct ulist_iterator uiter;  	struct btrfs_qgroup *qg; -	struct ulist_node *tmp_unode; -	struct ulist_iterator tmp_uiter; -	int ret; +	u64 cur_new_count, cur_old_count;  	ULIST_ITER_INIT(&uiter); -	while ((unode = ulist_next(roots, &uiter))) { -		qg = find_qgroup_rb(fs_info, unode->val); -		if (!qg) -			continue; +	while ((unode = ulist_next(qgroups, &uiter))) { +		bool dirty = false; -		ulist_reinit(tmp); -		ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); -		if (ret < 0) -			return ret; +		qg = u64_to_ptr(unode->aux); +		/* +		 * Wasn't referenced before but is now, add to the reference +		 * counters. +		 */ +		if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { +			qg->rfer += num_bytes; +			qg->rfer_cmpr += num_bytes; +			dirty = true; +		} -		ULIST_ITER_INIT(&tmp_uiter); -		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { -			struct btrfs_qgroup_list *glist; +		/* +		 * Was referenced before but isn't now, subtract from the +		 * reference counters. +		 */ +		if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { +			qg->rfer -= num_bytes; +			qg->rfer_cmpr -= num_bytes; +			dirty = true; +		} -			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; -			if (qg->tag == seq) -				continue; +		if (qg->old_refcnt < seq) +			cur_old_count = 0; +		else +			cur_old_count = qg->old_refcnt - seq; +		if (qg->new_refcnt < seq) +			cur_new_count = 0; +		else +			cur_new_count = qg->new_refcnt - seq; -			if (qg->refcnt - seq == roots->nnodes) { -				qg->excl -= sgn * num_bytes; -				qg->excl_cmpr -= sgn * num_bytes; -				qgroup_dirty(fs_info, qg); -			} +		/* +		 * If our refcount was the same as the roots previously but our +		 * new count isn't the same as the number of roots now then we +		 * went from having a exclusive reference on this range to not. +		 */ +		if (old_roots && cur_old_count == old_roots && +		    (cur_new_count != new_roots || new_roots == 0)) { +			WARN_ON(cur_new_count != new_roots && new_roots == 0); +			qg->excl -= num_bytes; +			qg->excl_cmpr -= num_bytes; +			dirty = true; +		} -			list_for_each_entry(glist, &qg->groups, next_group) { -				ret = ulist_add(tmp, glist->group->qgroupid, -						(uintptr_t)glist->group, -						GFP_ATOMIC); -				if (ret < 0) -					return ret; -			} +		/* +		 * If we didn't reference all the roots before but now we do we +		 * have an exclusive reference to this range. +		 */ +		if ((!old_roots || (old_roots && cur_old_count != old_roots)) +		    && cur_new_count == new_roots) { +			qg->excl += num_bytes; +			qg->excl_cmpr += num_bytes; +			dirty = true;  		} -	} +		if (dirty) +			qgroup_dirty(fs_info, qg); +	}  	return 0;  }  /* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr.  If we do then we can just discard this operation.   
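Condensed, qgroup_adjust_counters() above applies four per-qgroup transitions, driven only by how the effective old and new counts compare with old_roots and new_roots: newly referenced, no longer referenced, exclusivity lost, exclusivity gained. A simplified standalone restatement over a hypothetical struct (compressed counters again omitted):

#include <stdio.h>

struct qg { long long rfer, excl; };

/* cur_old/cur_new are the effective counts, i.e. refcnt - seq clamped at 0. */
static void adjust(struct qg *qg, long long bytes,
		   unsigned long long cur_old, unsigned long long cur_new,
		   unsigned long long old_roots, unsigned long long new_roots)
{
	if (cur_old == 0 && cur_new > 0)		/* newly referenced */
		qg->rfer += bytes;
	if (cur_old > 0 && cur_new == 0)		/* no longer referenced */
		qg->rfer -= bytes;
	if (old_roots && cur_old == old_roots &&
	    (cur_new != new_roots || new_roots == 0))	/* exclusivity lost */
		qg->excl -= bytes;
	if ((!old_roots || cur_old != old_roots) &&
	    cur_new == new_roots)			/* exclusivity gained */
		qg->excl += bytes;
}

int main(void)
{
	/* 16K extent previously exclusive to subvolume A; subvolume B, under the
	 * same parent group P, adds a shared ref: old_roots = 1, new_roots = 2. */
	struct qg A = { 16384, 16384 }, B = { 0, 0 }, P = { 16384, 16384 };

	adjust(&A, 16384, 1, 1, 1, 2);	/* keeps rfer, loses excl */
	adjust(&B, 16384, 0, 1, 1, 2);	/* gains rfer, not exclusive */
	adjust(&P, 16384, 1, 2, 1, 2);	/* still covers every referencing root */

	printf("A %lld/%lld  B %lld/%lld  P %lld/%lld\n",
	       A.rfer, A.excl, B.rfer, B.excl, P.rfer, P.excl);
	return 0;
}

In that scenario only B picks up 16K of referenced space, A keeps its referenced bytes but drops 16K of exclusive, and the shared parent P is untouched because every referencing root still sits below it.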
*/ -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, -			     struct btrfs_fs_info *fs_info, -			     struct btrfs_delayed_ref_node *node, -			     struct btrfs_delayed_extent_op *extent_op) +static int check_existing_refs(struct btrfs_trans_handle *trans, +			       struct btrfs_fs_info *fs_info, +			       struct btrfs_qgroup_operation *oper)  { -	struct btrfs_root *quota_root; -	u64 ref_root; -	struct btrfs_qgroup *qgroup;  	struct ulist *roots = NULL; -	u64 seq; +	struct ulist_node *unode; +	struct ulist_iterator uiter;  	int ret = 0; -	int sgn; -	if (!fs_info->quota_enabled) -		return 0; - -	BUG_ON(!fs_info->quota_root); +	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, +				   oper->elem.seq, &roots); +	if (ret < 0) +		return ret; +	ret = 0; -	if (node->type == BTRFS_TREE_BLOCK_REF_KEY || -	    node->type == BTRFS_SHARED_BLOCK_REF_KEY) { -		struct btrfs_delayed_tree_ref *ref; -		ref = btrfs_delayed_node_to_tree_ref(node); -		ref_root = ref->root; -	} else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || -		   node->type == BTRFS_SHARED_DATA_REF_KEY) { -		struct btrfs_delayed_data_ref *ref; -		ref = btrfs_delayed_node_to_data_ref(node); -		ref_root = ref->root; -	} else { -		BUG(); +	ULIST_ITER_INIT(&uiter); +	while ((unode = ulist_next(roots, &uiter))) { +		if (unode->val == oper->ref_root) { +			ret = 1; +			break; +		}  	} +	ulist_free(roots); +	btrfs_put_tree_mod_seq(fs_info, &oper->elem); -	if (!is_fstree(ref_root)) { -		/* -		 * non-fs-trees are not being accounted -		 */ -		return 0; -	} +	return ret; +} -	switch (node->action) { -	case BTRFS_ADD_DELAYED_REF: -	case BTRFS_ADD_DELAYED_EXTENT: -		sgn = 1; -		seq = btrfs_tree_mod_seq_prev(node->seq); -		break; -	case BTRFS_DROP_DELAYED_REF: -		sgn = -1; -		seq = node->seq; -		break; -	case BTRFS_UPDATE_DELAYED_HEAD: -		return 0; -	default: -		BUG(); -	} +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters.  The basic premise is this + * + * 1) We have seq to represent a 0 count.  Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count.  This means that if + * anybody is equal to or below this sequence they were never referenced.  We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently.  This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently.  We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + *			[qgroup 1/0] + *		     /         |          \ + *		[qg 0/0]   [qg 0/1]	[qg 0/2] + *		   \          |            / + *		  [	   extent	    ] + * + * Say we are adding a reference that is covered by qg 0/0.  The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2.  Because it is adding new_roots will be 1.  
We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes.  Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, +				    struct btrfs_fs_info *fs_info, +				    struct btrfs_qgroup_operation *oper) +{ +	struct ulist *roots = NULL; +	struct ulist *qgroups, *tmp; +	struct btrfs_qgroup *qgroup; +	struct seq_list elem = {}; +	u64 seq; +	int old_roots = 0; +	int new_roots = 0; +	int ret = 0; -	mutex_lock(&fs_info->qgroup_rescan_lock); -	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { -		if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { -			mutex_unlock(&fs_info->qgroup_rescan_lock); +	if (oper->elem.seq) { +		ret = check_existing_refs(trans, fs_info, oper); +		if (ret < 0) +			return ret; +		if (ret)  			return 0; -		}  	} -	mutex_unlock(&fs_info->qgroup_rescan_lock); -	/* -	 * the delayed ref sequence number we pass depends on the direction of -	 * the operation. for add operations, we pass -	 * tree_mod_log_prev_seq(node->seq) to skip -	 * the delayed ref's current sequence number, because we need the state -	 * of the tree before the add operation. for delete operations, we pass -	 * (node->seq) to include the delayed ref's current sequence number, -	 * because we need the state of the tree after the delete operation. -	 */ -	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); -	if (ret < 0) -		return ret; - -	spin_lock(&fs_info->qgroup_lock); +	qgroups = ulist_alloc(GFP_NOFS); +	if (!qgroups) +		return -ENOMEM; -	quota_root = fs_info->quota_root; -	if (!quota_root) -		goto unlock; +	tmp = ulist_alloc(GFP_NOFS); +	if (!tmp) +		return -ENOMEM; -	qgroup = find_qgroup_rb(fs_info, ref_root); +	btrfs_get_tree_mod_seq(fs_info, &elem); +	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, +				   &roots); +	btrfs_put_tree_mod_seq(fs_info, &elem); +	if (ret < 0) { +		ulist_free(qgroups); +		ulist_free(tmp); +		return ret; +	} +	spin_lock(&fs_info->qgroup_lock); +	qgroup = find_qgroup_rb(fs_info, oper->ref_root);  	if (!qgroup) -		goto unlock; +		goto out; +	seq = fs_info->qgroup_seq;  	/* -	 * step 1: for each old ref, visit all nodes once and inc refcnt +	 * So roots is the list of all the roots currently pointing at the +	 * bytenr, including the ref we are adding if we are adding, or not if +	 * we are removing a ref.  So we pass in the ref_root to skip that root +	 * in our calculations.  We set old_refnct and new_refcnt cause who the +	 * hell knows what everything looked like before, and it doesn't matter +	 * except...  	 */ -	ulist_reinit(fs_info->qgroup_ulist); -	seq = fs_info->qgroup_seq; -	fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ +	ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, +				     seq, &old_roots, 0); +	if (ret < 0) +		goto out; -	ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, -				       seq); -	if (ret) -		goto unlock; +	/* +	 * Now adjust the refcounts of the qgroups that care about this +	 * reference, either the old_count in the case of removal or new_count +	 * in the case of an addition. 
+	 */ +	ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, +				     seq); +	if (ret < 0) +		goto out;  	/* -	 * step 2: walk from the new root +	 * ...in the case of removals.  If we had a removal before we got around +	 * to processing this operation then we need to find that guy and count +	 * his references as if they really existed so we don't end up screwing +	 * up the exclusive counts.  Then whenever we go to process the delete +	 * everything will be grand and we can account for whatever exclusive +	 * changes need to be made there.  We also have to pass in old_roots so +	 * we have an accurate count of the roots as it pertains to this +	 * operations view of the world.  	 */ -	ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, -				       seq, sgn, node->num_bytes, qgroup); -	if (ret) -		goto unlock; +	ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, +					  &old_roots); +	if (ret < 0) +		goto out;  	/* -	 * step 3: walk again from old refs +	 * We are adding our root, need to adjust up the number of roots, +	 * otherwise old_roots is the number of roots we want.  	 */ -	ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, -				       seq, sgn, node->num_bytes); -	if (ret) -		goto unlock; +	if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { +		new_roots = old_roots + 1; +	} else { +		new_roots = old_roots; +		old_roots++; +	} +	fs_info->qgroup_seq += old_roots + 1; -unlock: + +	/* +	 * And now the magic happens, bless Arne for having a pretty elegant +	 * solution for this. +	 */ +	qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, +			       qgroups, seq, old_roots, new_roots, 0); +out:  	spin_unlock(&fs_info->qgroup_lock); +	ulist_free(qgroups);  	ulist_free(roots); +	ulist_free(tmp); +	return ret; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, +				struct btrfs_fs_info *fs_info, +				struct btrfs_qgroup_operation *oper) +{ +	int ret = 0; + +	if (!fs_info->quota_enabled) +		return 0; + +	BUG_ON(!fs_info->quota_root); + +	mutex_lock(&fs_info->qgroup_rescan_lock); +	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { +		if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { +			mutex_unlock(&fs_info->qgroup_rescan_lock); +			return 0; +		} +	} +	mutex_unlock(&fs_info->qgroup_rescan_lock); + +	ASSERT(is_fstree(oper->ref_root)); + +	switch (oper->type) { +	case BTRFS_QGROUP_OPER_ADD_EXCL: +	case BTRFS_QGROUP_OPER_SUB_EXCL: +		ret = qgroup_excl_accounting(fs_info, oper); +		break; +	case BTRFS_QGROUP_OPER_ADD_SHARED: +	case BTRFS_QGROUP_OPER_SUB_SHARED: +		ret = qgroup_shared_accounting(trans, fs_info, oper); +		break; +	default: +		ASSERT(0); +	} +	return ret; +} + +/* + * Needs to be called everytime we run delayed refs, even if there is an error + * in order to cleanup outstanding operations. 
+ */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, +				    struct btrfs_fs_info *fs_info) +{ +	struct btrfs_qgroup_operation *oper; +	int ret = 0; +	while (!list_empty(&trans->qgroup_ref_list)) { +		oper = list_first_entry(&trans->qgroup_ref_list, +					struct btrfs_qgroup_operation, list); +		list_del_init(&oper->list); +		if (!ret || !trans->aborted) +			ret = btrfs_qgroup_account(trans, fs_info, oper); +		spin_lock(&fs_info->qgroup_op_lock); +		rb_erase(&oper->n, &fs_info->qgroup_op_tree); +		spin_unlock(&fs_info->qgroup_op_lock); +		btrfs_put_tree_mod_seq(fs_info, &oper->elem); +		kfree(oper); +	}  	return ret;  } @@ -1629,8 +2114,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,  		srcgroup = find_qgroup_rb(fs_info, srcid);  		if (!srcgroup)  			goto unlock; -		dstgroup->rfer = srcgroup->rfer - level_size; -		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + +		/* +		 * We call inherit after we clone the root in order to make sure +		 * our counts don't go crazy, so at this point the only +		 * difference between the two roots should be the root node. +		 */ +		dstgroup->rfer = srcgroup->rfer; +		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; +		dstgroup->excl = level_size; +		dstgroup->excl_cmpr = level_size;  		srcgroup->excl = level_size;  		srcgroup->excl_cmpr = level_size;  		qgroup_dirty(fs_info, dstgroup); @@ -1734,7 +2227,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)  		struct btrfs_qgroup *qg;  		struct btrfs_qgroup_list *glist; -		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; +		qg = u64_to_ptr(unode->aux);  		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&  		    qg->reserved + (s64)qg->rfer + num_bytes > @@ -1766,7 +2259,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)  	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {  		struct btrfs_qgroup *qg; -		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; +		qg = u64_to_ptr(unode->aux);  		qg->reserved += num_bytes;  	} @@ -1812,7 +2305,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)  		struct btrfs_qgroup *qg;  		struct btrfs_qgroup_list *glist; -		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; +		qg = u64_to_ptr(unode->aux);  		qg->reserved -= num_bytes; @@ -1848,15 +2341,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)   */  static int  qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, -		   struct btrfs_trans_handle *trans, struct ulist *tmp, -		   struct extent_buffer *scratch_leaf) +		   struct btrfs_trans_handle *trans, struct ulist *qgroups, +		   struct ulist *tmp, struct extent_buffer *scratch_leaf)  {  	struct btrfs_key found;  	struct ulist *roots = NULL; -	struct ulist_node *unode; -	struct ulist_iterator uiter;  	struct seq_list tree_mod_seq_elem = {}; +	u64 num_bytes;  	u64 seq; +	int new_roots;  	int slot;  	int ret; @@ -1897,8 +2390,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,  	mutex_unlock(&fs_info->qgroup_rescan_lock);  	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { -		u64 num_bytes; -  		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);  		if (found.type != BTRFS_EXTENT_ITEM_KEY &&  		    found.type != BTRFS_METADATA_ITEM_KEY) @@ -1908,76 +2399,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,  		else  			num_bytes = found.offset; -		ret = btrfs_find_all_roots(trans, fs_info, found.objectid, -					   tree_mod_seq_elem.seq, &roots); +		
ulist_reinit(qgroups); +		ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, +					   &roots);  		if (ret < 0)  			goto out;  		spin_lock(&fs_info->qgroup_lock);  		seq = fs_info->qgroup_seq;  		fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ -		ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); -		if (ret) { +		new_roots = 0; +		ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, +					     seq, &new_roots, 1); +		if (ret < 0) {  			spin_unlock(&fs_info->qgroup_lock);  			ulist_free(roots);  			goto out;  		} -		/* -		 * step2 of btrfs_qgroup_account_ref works from a single root, -		 * we're doing all at once here. -		 */ -		ulist_reinit(tmp); -		ULIST_ITER_INIT(&uiter); -		while ((unode = ulist_next(roots, &uiter))) { -			struct btrfs_qgroup *qg; - -			qg = find_qgroup_rb(fs_info, unode->val); -			if (!qg) -				continue; - -			ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, -					GFP_ATOMIC); -			if (ret < 0) { -				spin_unlock(&fs_info->qgroup_lock); -				ulist_free(roots); -				goto out; -			} -		} - -		/* this loop is similar to step 2 of btrfs_qgroup_account_ref */ -		ULIST_ITER_INIT(&uiter); -		while ((unode = ulist_next(tmp, &uiter))) { -			struct btrfs_qgroup *qg; -			struct btrfs_qgroup_list *glist; - -			qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; -			qg->rfer += num_bytes; -			qg->rfer_cmpr += num_bytes; -			WARN_ON(qg->tag >= seq); -			if (qg->refcnt - seq == roots->nnodes) { -				qg->excl += num_bytes; -				qg->excl_cmpr += num_bytes; -			} -			qgroup_dirty(fs_info, qg); - -			list_for_each_entry(glist, &qg->groups, next_group) { -				ret = ulist_add(tmp, glist->group->qgroupid, -						(uintptr_t)glist->group, -						GFP_ATOMIC); -				if (ret < 0) { -					spin_unlock(&fs_info->qgroup_lock); -					ulist_free(roots); -					goto out; -				} -			} +		ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, +					     seq, 0, new_roots, 1); +		if (ret < 0) { +			spin_unlock(&fs_info->qgroup_lock); +			ulist_free(roots); +			goto out;  		} -  		spin_unlock(&fs_info->qgroup_lock);  		ulist_free(roots); -		ret = 0;  	} -  out:  	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -1990,13 +2439,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)  						     qgroup_rescan_work);  	struct btrfs_path *path;  	struct btrfs_trans_handle *trans = NULL; -	struct ulist *tmp = NULL; +	struct ulist *tmp = NULL, *qgroups = NULL;  	struct extent_buffer *scratch_leaf = NULL;  	int err = -ENOMEM;  	path = btrfs_alloc_path();  	if (!path)  		goto out; +	qgroups = ulist_alloc(GFP_NOFS); +	if (!qgroups) +		goto out;  	tmp = ulist_alloc(GFP_NOFS);  	if (!tmp)  		goto out; @@ -2015,7 +2467,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)  			err = -EINTR;  		} else {  			err = qgroup_rescan_leaf(fs_info, path, trans, -						 tmp, scratch_leaf); +						 qgroups, tmp, scratch_leaf);  		}  		if (err > 0)  			btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2025,6 +2477,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)  out:  	kfree(scratch_leaf); +	ulist_free(qgroups);  	ulist_free(tmp);  	btrfs_free_path(path); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h new file mode 100644 index 000000000000..5952ff1fbd7a --- /dev/null +++ b/fs/btrfs/qgroup.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 Facebook.  All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_QGROUP__ +#define __BTRFS_QGROUP__ + +/* + * A description of the operations, all of these operations only happen when we + * are adding the 1st reference for that subvolume in the case of adding space + * or on the last reference delete in the case of subtraction.  The only + * exception is the last one, which is added for confusion. + * + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only + * one pointing at the bytes we are adding.  This is called on the first + * allocation. + * + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be + * shared between subvols.  This is called on the creation of a ref that already + * has refs from a different subvolume, so basically reflink. + * + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only + * one referencing the range. + * + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with + * refs with other subvolumes. + */ +enum btrfs_qgroup_operation_type { +	BTRFS_QGROUP_OPER_ADD_EXCL, +	BTRFS_QGROUP_OPER_ADD_SHARED, +	BTRFS_QGROUP_OPER_SUB_EXCL, +	BTRFS_QGROUP_OPER_SUB_SHARED, +}; + +struct btrfs_qgroup_operation { +	u64 ref_root; +	u64 bytenr; +	u64 num_bytes; +	u64 seq; +	enum btrfs_qgroup_operation_type type; +	struct seq_list elem; +	struct rb_node n; +	struct list_head list; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, +		       struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, +			struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, +			      struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, +			      struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, +			struct btrfs_fs_info *fs_info, u64 qgroupid, +			char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, +			      struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, +		       struct btrfs_fs_info *fs_info, u64 qgroupid, +		       struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, +			    struct btrfs_fs_info *fs_info, u64 ref_root, +			    u64 bytenr, u64 num_bytes, +			    enum btrfs_qgroup_operation_type type, +			    int mod_seq); +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, +				    struct btrfs_fs_info 
*fs_info); +void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, +				   struct btrfs_fs_info *fs_info, +				   struct btrfs_qgroup_operation *oper); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, +		      struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, +			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, +			 struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, +			       u64 rfer, u64 excl); +#endif + +#endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f92ab1daa87..65245a07275b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)  	if (bnode->root)  		fs_info = bnode->root->fs_info;  	btrfs_panic(fs_info, errno, "Inconsistency in backref cache " -		    "found at offset %llu\n", bytenr); +		    "found at offset %llu", bytenr);  }  /* @@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root)  {  	struct btrfs_root *reloc_root; -	if (!root->ref_cows) +	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		return 0;  	reloc_root = root->reloc_root; @@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc,  	root = read_fs_root(rc->extent_root->fs_info, root_objectid);  	BUG_ON(IS_ERR(root)); -	if (root->ref_cows && +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&  	    generation != btrfs_root_generation(&root->root_item))  		return NULL; @@ -887,7 +887,7 @@ again:  			goto out;  		} -		if (!root->ref_cows) +		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))  			cur->cowonly = 1;  		if (btrfs_root_level(&root->root_item) == cur->level) { @@ -954,7 +954,8 @@ again:  				upper->bytenr = eb->start;  				upper->owner = btrfs_header_owner(eb);  				upper->level = lower->level + 1; -				if (!root->ref_cows) +				if (!test_bit(BTRFS_ROOT_REF_COWS, +					      &root->state))  					upper->cowonly = 1;  				/* @@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)  	if (rb_node) {  		btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "  			    "for start=%llu while inserting into relocation " -			    "tree\n", node->bytenr); +			    "tree", node->bytenr);  		kfree(node);  		return -EEXIST;  	} @@ -2441,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,  		next = walk_up_backref(next, edges, &index);  		root = next->root;  		BUG_ON(!root); -		BUG_ON(!root->ref_cows); +		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));  		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {  			record_reloc_root_in_trans(trans, root); @@ -2506,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,  		BUG_ON(!root);  		/* no other choice for non-references counted tree */ -		if (!root->ref_cows) +		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))  			return root;  		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) @@ -2893,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  		goto out;  	} -	if (!root || root->ref_cows) { +	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {  		ret = 
reserve_metadata_space(trans, rc, node);  		if (ret)  			goto out;  	}  	if (root) { -		if (root->ref_cows) { +		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {  			BUG_ON(node->new_bytenr);  			BUG_ON(!list_empty(&node->list));  			btrfs_record_root_in_trans(trans, root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 38bb47e7d6b1..360a728a639f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -306,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)  			break;  		} -		root->orphan_item_inserted = 1; +		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);  		err = btrfs_insert_fs_root(root->fs_info, root);  		if (err) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0be77993378e..ac80188eec88 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -588,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {  		do { -			ret = tree_backref_for_extent(&ptr, eb, ei, item_size, -							&ref_root, &ref_level); +			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, +						      item_size, &ref_root, +						      &ref_level);  			printk_in_rcu(KERN_WARNING  				"BTRFS: %s at logical %llu on dev %s, "  				"sector %llu: metadata %s (level %d) in tree " @@ -717,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)  out:  	if (page)  		put_page(page); -	if (inode) -		iput(inode); + +	iput(inode);  	if (ret < 0)  		return ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index eb6537a08c1b..6528aa662181 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -360,10 +360,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)  	/*  	 * First time the inline_buf does not suffice  	 */ -	if (p->buf == p->inline_buf) +	if (p->buf == p->inline_buf) {  		tmp_buf = kmalloc(len, GFP_NOFS); -	else +		if (tmp_buf) +			memcpy(tmp_buf, p->buf, old_buf_len); +	} else {  		tmp_buf = krealloc(p->buf, len, GFP_NOFS); +	}  	if (!tmp_buf)  		return -ENOMEM;  	p->buf = tmp_buf; @@ -972,7 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  	struct btrfs_dir_item *di;  	struct btrfs_key di_key;  	char *buf = NULL; -	const int buf_len = PATH_MAX; +	int buf_len;  	u32 name_len;  	u32 data_len;  	u32 cur; @@ -982,6 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  	int num;  	u8 type; +	if (found_key->type == BTRFS_XATTR_ITEM_KEY) +		buf_len = BTRFS_MAX_XATTR_SIZE(root); +	else +		buf_len = PATH_MAX; +  	buf = kmalloc(buf_len, GFP_NOFS);  	if (!buf) {  		ret = -ENOMEM; @@ -1003,12 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  		type = btrfs_dir_type(eb, di);  		btrfs_dir_item_key_to_cpu(eb, di, &di_key); -		/* -		 * Path too long -		 */ -		if (name_len + data_len > buf_len) { -			ret = -ENAMETOOLONG; -			goto out; +		if (type == BTRFS_FT_XATTR) { +			if (name_len > XATTR_NAME_MAX) { +				ret = -ENAMETOOLONG; +				goto out; +			} +			if (name_len + data_len > buf_len) { +				ret = -E2BIG; +				goto out; +			} +		} else { +			/* +			 * Path too long +			 */ +			if (name_len + data_len > buf_len) { +				ret = -ENAMETOOLONG; +				goto out; +			}  		}  		read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -1346,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx,  		ret = -EIO;  		btrfs_err(sctx->send_root->fs_info, "did not find backref in "  				"send_root. 
inode=%llu, offset=%llu, " -				"disk_byte=%llu found extent=%llu\n", +				"disk_byte=%llu found extent=%llu",  				ino, data_offset, disk_byte, found_key.objectid);  		goto out;  	} @@ -1625,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root,  		goto out;  	}  	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); +	if (key.type == BTRFS_ROOT_ITEM_KEY) { +		ret = -ENOENT; +		goto out; +	}  	*found_inode = key.objectid;  	*found_type = btrfs_dir_type(path->nodes[0], di); @@ -1668,7 +1691,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,  		goto out;  	} -	if (key.type == BTRFS_INODE_REF_KEY) { +	if (found_key.type == BTRFS_INODE_REF_KEY) {  		struct btrfs_inode_ref *iref;  		iref = btrfs_item_ptr(path->nodes[0], path->slots[0],  				      struct btrfs_inode_ref); @@ -1690,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,  		goto out;  	btrfs_release_path(path); -	ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, -			NULL, NULL); -	if (ret < 0) -		goto out; +	if (dir_gen) { +		ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, +				     NULL, NULL, NULL); +		if (ret < 0) +			goto out; +	}  	*dir = parent_dir; @@ -1709,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root,  	int ret;  	struct fs_path *tmp_name;  	u64 tmp_dir; -	u64 tmp_dir_gen;  	tmp_name = fs_path_alloc();  	if (!tmp_name)  		return -ENOMEM; -	ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); +	ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);  	if (ret < 0)  		goto out; @@ -2026,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  {  	int ret;  	int nce_ret; -	struct btrfs_path *path = NULL;  	struct name_cache_entry *nce = NULL;  	/* @@ -2052,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  		}  	} -	path = alloc_path_for_send(); -	if (!path) -		return -ENOMEM; -  	/*  	 * If the inode is not existent yet, add the orphan name and return 1.  	 
* This should only happen for the parent dir that we determine in @@ -2131,7 +2150,6 @@ out_cache:  	name_cache_clean_unused(sctx);  out: -	btrfs_free_path(path);  	return ret;  } @@ -2942,7 +2960,9 @@ static void free_waiting_dir_move(struct send_ctx *sctx,  static int add_pending_dir_move(struct send_ctx *sctx,  				u64 ino,  				u64 ino_gen, -				u64 parent_ino) +				u64 parent_ino, +				struct list_head *new_refs, +				struct list_head *deleted_refs)  {  	struct rb_node **p = &sctx->pending_dir_moves.rb_node;  	struct rb_node *parent = NULL; @@ -2974,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx,  		}  	} -	list_for_each_entry(cur, &sctx->deleted_refs, list) { +	list_for_each_entry(cur, deleted_refs, list) {  		ret = dup_ref(cur, &pm->update_refs);  		if (ret < 0)  			goto out;  	} -	list_for_each_entry(cur, &sctx->new_refs, list) { +	list_for_each_entry(cur, new_refs, list) {  		ret = dup_ref(cur, &pm->update_refs);  		if (ret < 0)  			goto out; @@ -3022,6 +3042,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,  	return NULL;  } +static int path_loop(struct send_ctx *sctx, struct fs_path *name, +		     u64 ino, u64 gen, u64 *ancestor_ino) +{ +	int ret = 0; +	u64 parent_inode = 0; +	u64 parent_gen = 0; +	u64 start_ino = ino; + +	*ancestor_ino = 0; +	while (ino != BTRFS_FIRST_FREE_OBJECTID) { +		fs_path_reset(name); + +		if (is_waiting_for_rm(sctx, ino)) +			break; +		if (is_waiting_for_move(sctx, ino)) { +			if (*ancestor_ino == 0) +				*ancestor_ino = ino; +			ret = get_first_ref(sctx->parent_root, ino, +					    &parent_inode, &parent_gen, name); +		} else { +			ret = __get_cur_name_and_parent(sctx, ino, gen, +							&parent_inode, +							&parent_gen, name); +			if (ret > 0) { +				ret = 0; +				break; +			} +		} +		if (ret < 0) +			break; +		if (parent_inode == start_ino) { +			ret = 1; +			if (*ancestor_ino == 0) +				*ancestor_ino = ino; +			break; +		} +		ino = parent_inode; +		gen = parent_gen; +	} +	return ret; +} +  static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)  {  	struct fs_path *from_path = NULL; @@ -3033,6 +3095,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)  	struct waiting_dir_move *dm = NULL;  	u64 rmdir_ino = 0;  	int ret; +	u64 ancestor = 0;  	name = fs_path_alloc();  	from_path = fs_path_alloc(); @@ -3051,34 +3114,33 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)  	if (ret < 0)  		goto out; -	if (parent_ino == sctx->cur_ino) { -		/* child only renamed, not moved */ -		ASSERT(parent_gen == sctx->cur_inode_gen); -		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, -				   from_path); -		if (ret < 0) -			goto out; -		ret = fs_path_add_path(from_path, name); -		if (ret < 0) -			goto out; -	} else { -		/* child moved and maybe renamed too */ -		sctx->send_progress = pm->ino; -		ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); +	ret = get_cur_path(sctx, parent_ino, parent_gen, +			   from_path); +	if (ret < 0) +		goto out; +	ret = fs_path_add_path(from_path, name); +	if (ret < 0) +		goto out; + +	sctx->send_progress = sctx->cur_ino + 1; +	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); +	if (ret) { +		LIST_HEAD(deleted_refs); +		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); +		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, +					   &pm->update_refs, &deleted_refs);  		if (ret < 0)  			goto out; -	} - -	fs_path_free(name); -	name = NULL; - -	to_path = fs_path_alloc(); -	if 
(!to_path) { -		ret = -ENOMEM; +		if (rmdir_ino) { +			dm = get_waiting_dir_move(sctx, pm->ino); +			ASSERT(dm); +			dm->rmdir_ino = rmdir_ino; +		}  		goto out;  	} - -	sctx->send_progress = sctx->cur_ino + 1; +	fs_path_reset(name); +	to_path = name; +	name = NULL;  	ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);  	if (ret < 0)  		goto out; @@ -3202,127 +3264,74 @@ out:  static int wait_for_parent_move(struct send_ctx *sctx,  				struct recorded_ref *parent_ref)  { -	int ret; +	int ret = 0;  	u64 ino = parent_ref->dir;  	u64 parent_ino_before, parent_ino_after; -	u64 old_gen;  	struct fs_path *path_before = NULL;  	struct fs_path *path_after = NULL;  	int len1, len2; -	int register_upper_dirs; -	u64 gen; - -	if (is_waiting_for_move(sctx, ino)) -		return 1; - -	if (parent_ref->dir <= sctx->cur_ino) -		return 0; - -	ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, -			     NULL, NULL, NULL, NULL); -	if (ret == -ENOENT) -		return 0; -	else if (ret < 0) -		return ret; - -	if (parent_ref->dir_gen != old_gen) -		return 0; - -	path_before = fs_path_alloc(); -	if (!path_before) -		return -ENOMEM; - -	ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, -			    NULL, path_before); -	if (ret == -ENOENT) { -		ret = 0; -		goto out; -	} else if (ret < 0) { -		goto out; -	}  	path_after = fs_path_alloc(); -	if (!path_after) { +	path_before = fs_path_alloc(); +	if (!path_after || !path_before) {  		ret = -ENOMEM;  		goto out;  	} -	ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, -			    &gen, path_after); -	if (ret == -ENOENT) { -		ret = 0; -		goto out; -	} else if (ret < 0) { -		goto out; -	} - -	len1 = fs_path_len(path_before); -	len2 = fs_path_len(path_after); -	if (parent_ino_before != parent_ino_after || len1 != len2 || -	     memcmp(path_before->start, path_after->start, len1)) { -		ret = 1; -		goto out; -	} -	ret = 0; -  	/* -	 * Ok, our new most direct ancestor has a higher inode number but -	 * wasn't moved/renamed. So maybe some of the new ancestors higher in -	 * the hierarchy have an higher inode number too *and* were renamed -	 * or moved - in this case we need to wait for the ancestor's rename -	 * or move operation before we can do the move/rename for the current -	 * inode. +	 * Our current directory inode may not yet be renamed/moved because some +	 * ancestor (immediate or not) has to be renamed/moved first. So find if +	 * such ancestor exists and make sure our own rename/move happens after +	 * that ancestor is processed.  	 
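The ancestor walk that implements the comment above defers the current directory's rename whenever an ancestor in the send snapshot is itself still waiting to be moved, does not exist in the parent snapshot yet, or, for ancestors with a higher inode number than the one being processed, sits under a different parent or name than it did in the parent snapshot. A toy model of that decision, with hypothetical types standing in for the two first-ref lookups:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Location of one ancestor: its parent directory inode and ref name,
 * as seen in the parent snapshot (before) or the send snapshot (after). */
struct loc { unsigned long long parent; const char *name; bool exists; };

static bool must_delay(bool waiting_for_move, struct loc before, struct loc after,
		       unsigned long long ino, unsigned long long cur_ino)
{
	if (waiting_for_move)
		return true;		/* ancestor itself queued behind a pending move */
	if (!before.exists)
		return true;		/* ancestor is new, processed later */
	if (ino > cur_ino &&
	    (before.parent != after.parent || strcmp(before.name, after.name)))
		return true;		/* not-yet-processed ancestor was moved/renamed */
	return false;
}

int main(void)
{
	/* Directory 260 is being processed; its ancestor 261 (higher inode,
	 * so not processed yet) moved from parent 256 to parent 259. */
	struct loc before = { 256, "x", true };
	struct loc after = { 259, "x", true };

	printf("delay: %d\n", must_delay(false, before, after, 261, 260));
	return 0;
}

When the walk decides to delay, the move is registered against the ancestor it stopped at, which is what the add_pending_dir_move() call at the end of wait_for_parent_move() does with ino.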
*/ -	register_upper_dirs = 0; -	ino = parent_ino_after; -again: -	while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { -		u64 parent_gen; +	while (ino > BTRFS_FIRST_FREE_OBJECTID) { +		if (is_waiting_for_move(sctx, ino)) { +			ret = 1; +			break; +		}  		fs_path_reset(path_before);  		fs_path_reset(path_after);  		ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, -				    &parent_gen, path_after); +				    NULL, path_after);  		if (ret < 0)  			goto out;  		ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,  				    NULL, path_before); -		if (ret == -ENOENT) { -			ret = 0; -			break; -		} else if (ret < 0) { +		if (ret < 0 && ret != -ENOENT) {  			goto out; +		} else if (ret == -ENOENT) { +			ret = 1; +			break;  		}  		len1 = fs_path_len(path_before);  		len2 = fs_path_len(path_after); -		if (parent_ino_before != parent_ino_after || len1 != len2 || -		    memcmp(path_before->start, path_after->start, len1)) { +		if (ino > sctx->cur_ino && +		    (parent_ino_before != parent_ino_after || len1 != len2 || +		     memcmp(path_before->start, path_after->start, len1))) {  			ret = 1; -			if (register_upper_dirs) { -				break; -			} else { -				register_upper_dirs = 1; -				ino = parent_ref->dir; -				gen = parent_ref->dir_gen; -				goto again; -			} -		} else if (register_upper_dirs) { -			ret = add_pending_dir_move(sctx, ino, gen, -						   parent_ino_after); -			if (ret < 0 && ret != -EEXIST) -				goto out; +			break;  		} -  		ino = parent_ino_after; -		gen = parent_gen;  	}  out:  	fs_path_free(path_before);  	fs_path_free(path_after); +	if (ret == 1) { +		ret = add_pending_dir_move(sctx, +					   sctx->cur_ino, +					   sctx->cur_inode_gen, +					   ino, +					   &sctx->new_refs, +					   &sctx->deleted_refs); +		if (!ret) +			ret = 1; +	} +  	return ret;  } @@ -3483,10 +3492,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  				if (ret < 0)  					goto out;  				if (ret) { -					ret = add_pending_dir_move(sctx, -							   sctx->cur_ino, -							   sctx->cur_inode_gen, -							   cur->dir);  					*pending_move = 1;  				} else {  					ret = send_rename(sctx, valid_path, @@ -5487,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)  	 */  	if (root->send_in_progress < 0)  		btrfs_err(root->fs_info, -			"send_in_progres unbalanced %d root %llu\n", +			"send_in_progres unbalanced %d root %llu",  			root->send_in_progress, root->root_key.objectid);  	spin_unlock(&root->root_item_lock);  } @@ -5515,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	/*  	 * The subvolume must remain read-only during send, protect against -	 * making it RW. +	 * making it RW. This also protects against deletion.  	 
*/  	spin_lock(&send_root->root_item_lock);  	send_root->send_in_progress++; @@ -5575,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	}  	sctx->send_root = send_root; +	/* +	 * Unlikely but possible, if the subvolume is marked for deletion but +	 * is slow to remove the directory entry, send can still be started +	 */ +	if (btrfs_root_dead(sctx->send_root)) { +		ret = -EPERM; +		goto out; +	} +  	sctx->clone_roots_cnt = arg->clone_sources_count;  	sctx->send_max_size = BTRFS_SEND_BUF_SIZE; @@ -5664,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  		spin_lock(&sctx->parent_root->root_item_lock);  		sctx->parent_root->send_in_progress++; -		if (!btrfs_root_readonly(sctx->parent_root)) { +		if (!btrfs_root_readonly(sctx->parent_root) || +				btrfs_root_dead(sctx->parent_root)) {  			spin_unlock(&sctx->parent_root->root_item_lock);  			srcu_read_unlock(&fs_info->subvol_srcu, index);  			ret = -EPERM; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9601d25a4607..4662d92a4b73 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -511,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  			} else if (compress) {  				if (!btrfs_test_opt(root, COMPRESS))  					btrfs_info(root->fs_info, -						   "btrfs: use %s compression\n", +						   "btrfs: use %s compression",  						   compress_type);  			}  			break; @@ -580,8 +580,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  			}  			break;  		case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL  			root->fs_info->sb->s_flags |= MS_POSIXACL;  			break; +#else +			btrfs_err(root->fs_info, +				"support for ACL not compiled in!"); +			ret = -EINVAL; +			goto out; +#endif  		case Opt_noacl:  			root->fs_info->sb->s_flags &= ~MS_POSIXACL;  			break; @@ -1413,6 +1420,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)  		 * this also happens on 'umount -rf' or on shutdown, when  		 * the filesystem is busy.  		 
*/ +		cancel_work_sync(&fs_info->async_reclaim_work);  		/* wait for the uuid_scan task to finish */  		down(&fs_info->uuid_tree_rescan_sem); @@ -1894,6 +1902,9 @@ static int btrfs_run_sanity_tests(void)  	if (ret)  		goto out;  	ret = btrfs_test_inodes(); +	if (ret) +		goto out; +	ret = btrfs_test_qgroups();  out:  	btrfs_destroy_test_fs();  	return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c5eb2143dc66..df39458f1487 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -254,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,  BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);  #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)  static ssize_t raid_bytes_show(struct kobject *kobj,  			       struct kobj_attribute *attr, char *buf); @@ -266,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,  {  	struct btrfs_space_info *sinfo = to_space_info(kobj->parent);  	struct btrfs_block_group_cache *block_group; -	int index = kobj - sinfo->block_group_kobjs; +	int index = to_raid_kobj(kobj)->raid_type;  	u64 val = 0;  	down_read(&sinfo->groups_sem); @@ -288,7 +289,7 @@ static struct attribute *raid_attributes[] = {  static void release_raid_kobj(struct kobject *kobj)  { -	kobject_put(kobj->parent); +	kfree(to_raid_kobj(kobj));  }  struct kobj_type btrfs_raid_ktype = { @@ -374,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,  	struct btrfs_root *root = fs_info->fs_root;  	int ret; -	if (len >= BTRFS_LABEL_SIZE) { -		pr_err("BTRFS: unable to set label with more than %d bytes\n", -		       BTRFS_LABEL_SIZE - 1); +	if (len >= BTRFS_LABEL_SIZE)  		return -EINVAL; -	}  	trans = btrfs_start_transaction(root, 0);  	if (IS_ERR(trans)) @@ -396,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj,  }  BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); +static ssize_t btrfs_no_store(struct kobject *kobj, +				 struct kobj_attribute *a, +				 const char *buf, size_t len) +{ +	return -EPERM; +} + +static ssize_t btrfs_nodesize_show(struct kobject *kobj, +				struct kobj_attribute *a, char *buf) +{ +	struct btrfs_fs_info *fs_info = to_fs_info(kobj); + +	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); +} + +BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, +				struct kobj_attribute *a, char *buf) +{ +	struct btrfs_fs_info *fs_info = to_fs_info(kobj); + +	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, +				struct kobj_attribute *a, char *buf) +{ +	struct btrfs_fs_info *fs_info = to_fs_info(kobj); + +	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); +  static struct attribute *btrfs_attrs[] = {  	BTRFS_ATTR_PTR(label), +	BTRFS_ATTR_PTR(nodesize), +	BTRFS_ATTR_PTR(sectorsize), +	BTRFS_ATTR_PTR(clone_alignment),  	NULL,  }; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 757ef00a75a4..a5dcacb5df9c 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@  #include <linux/magic.h>  #include "btrfs-tests.h"  #include "../ctree.h" +#include "../volumes.h" 
+#include "../disk-io.h" +#include "../qgroup.h"  static struct vfsmount *test_mnt = NULL; @@ -72,3 +75,97 @@ void btrfs_destroy_test_fs(void)  	kern_unmount(test_mnt);  	unregister_filesystem(&test_type);  } + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +{ +	struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), +						GFP_NOFS); + +	if (!fs_info) +		return fs_info; +	fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), +				      GFP_NOFS); +	if (!fs_info->fs_devices) { +		kfree(fs_info); +		return NULL; +	} +	fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), +				      GFP_NOFS); +	if (!fs_info->super_copy) { +		kfree(fs_info->fs_devices); +		kfree(fs_info); +		return NULL; +	} + +	if (init_srcu_struct(&fs_info->subvol_srcu)) { +		kfree(fs_info->fs_devices); +		kfree(fs_info->super_copy); +		kfree(fs_info); +		return NULL; +	} + +	spin_lock_init(&fs_info->buffer_lock); +	spin_lock_init(&fs_info->qgroup_lock); +	spin_lock_init(&fs_info->qgroup_op_lock); +	spin_lock_init(&fs_info->super_lock); +	spin_lock_init(&fs_info->fs_roots_radix_lock); +	spin_lock_init(&fs_info->tree_mod_seq_lock); +	mutex_init(&fs_info->qgroup_ioctl_lock); +	mutex_init(&fs_info->qgroup_rescan_lock); +	rwlock_init(&fs_info->tree_mod_log_lock); +	fs_info->running_transaction = NULL; +	fs_info->qgroup_tree = RB_ROOT; +	fs_info->qgroup_ulist = NULL; +	atomic64_set(&fs_info->tree_mod_seq, 0); +	INIT_LIST_HEAD(&fs_info->dirty_qgroups); +	INIT_LIST_HEAD(&fs_info->dead_roots); +	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); +	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); +	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); +	return fs_info; +} + +static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ +	struct radix_tree_iter iter; +	void **slot; + +	spin_lock(&fs_info->buffer_lock); +restart: +	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { +		struct extent_buffer *eb; + +		eb = radix_tree_deref_slot(slot); +		if (!eb) +			continue; +		/* Shouldn't happen but that kind of thinking creates CVE's */ +		if (radix_tree_exception(eb)) { +			if (radix_tree_deref_retry(eb)) +				goto restart; +			continue; +		} +		spin_unlock(&fs_info->buffer_lock); +		free_extent_buffer_stale(eb); +		spin_lock(&fs_info->buffer_lock); +	} +	spin_unlock(&fs_info->buffer_lock); + +	btrfs_free_qgroup_config(fs_info); +	btrfs_free_fs_roots(fs_info); +	cleanup_srcu_struct(&fs_info->subvol_srcu); +	kfree(fs_info->super_copy); +	kfree(fs_info->fs_devices); +	kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ +	if (!root) +		return; +	if (root->node) +		free_extent_buffer(root->node); +	if (root->fs_info) +		btrfs_free_dummy_fs_info(root->fs_info); +	kfree(root); +} + diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 312560a9123d..fd3954224480 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,13 +23,18 @@  #define test_msg(fmt, ...) 
pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) +struct btrfs_root; +  int btrfs_test_free_space_cache(void);  int btrfs_test_extent_buffer_operations(void);  int btrfs_test_extent_io(void);  int btrfs_test_inodes(void); +int btrfs_test_qgroups(void);  int btrfs_init_test_fs(void);  void btrfs_destroy_test_fs(void);  struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_root(struct btrfs_root *root);  #else  static inline int btrfs_test_free_space_cache(void)  { @@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void)  {  	return 0;  } +static inline int btrfs_test_qgroups(void) +{ +	return 0; +}  #endif  #endif diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 397d1f99a8eb..3ae0f5b8bb80 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -23,33 +23,6 @@  #include "../extent_io.h"  #include "../volumes.h" -static struct btrfs_fs_info *alloc_dummy_fs_info(void) -{ -	struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), -						GFP_NOFS); -	if (!fs_info) -		return fs_info; -	fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), -				      GFP_NOFS); -	if (!fs_info->fs_devices) { -		kfree(fs_info); -		return NULL; -	} -	return fs_info; -} -static void free_dummy_root(struct btrfs_root *root) -{ -	if (!root) -		return; -	if (root->fs_info) { -		kfree(root->fs_info->fs_devices); -		kfree(root->fs_info); -	} -	if (root->node) -		free_extent_buffer(root->node); -	kfree(root); -} -  static void insert_extent(struct btrfs_root *root, u64 start, u64 len,  			  u64 ram_bytes, u64 offset, u64 disk_bytenr,  			  u64 disk_len, u32 type, u8 compression, int slot) @@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void)  	 * We do this since btrfs_get_extent wants to assign em->bdev to  	 * root->fs_info->fs_devices->latest_bdev.  	 */ -	root->fs_info = alloc_dummy_fs_info(); +	root->fs_info = btrfs_alloc_dummy_fs_info();  	if (!root->fs_info) {  		test_msg("Couldn't allocate dummy fs info\n");  		goto out; @@ -837,7 +810,7 @@ out:  	if (!IS_ERR(em))  		free_extent_map(em);  	iput(inode); -	free_dummy_root(root); +	btrfs_free_dummy_root(root);  	return ret;  } @@ -864,7 +837,7 @@ static int test_hole_first(void)  		goto out;  	} -	root->fs_info = alloc_dummy_fs_info(); +	root->fs_info = btrfs_alloc_dummy_fs_info();  	if (!root->fs_info) {  		test_msg("Couldn't allocate dummy fs info\n");  		goto out; @@ -934,7 +907,7 @@ out:  	if (!IS_ERR(em))  		free_extent_map(em);  	iput(inode); -	free_dummy_root(root); +	btrfs_free_dummy_root(root);  	return ret;  } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 000000000000..fa691b754aaf --- /dev/null +++ b/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,468 @@ +/* + * Copyright (C) 2013 Facebook.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static void init_dummy_trans(struct btrfs_trans_handle *trans) +{ +	memset(trans, 0, sizeof(*trans)); +	trans->transid = 1; +	INIT_LIST_HEAD(&trans->qgroup_ref_list); +	trans->type = __TRANS_DUMMY; +} + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, +				  u64 num_bytes, u64 parent, u64 root_objectid) +{ +	struct btrfs_trans_handle trans; +	struct btrfs_extent_item *item; +	struct btrfs_extent_inline_ref *iref; +	struct btrfs_tree_block_info *block_info; +	struct btrfs_path *path; +	struct extent_buffer *leaf; +	struct btrfs_key ins; +	u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); +	int ret; + +	init_dummy_trans(&trans); + +	ins.objectid = bytenr; +	ins.type = BTRFS_EXTENT_ITEM_KEY; +	ins.offset = num_bytes; + +	path = btrfs_alloc_path(); +	if (!path) { +		test_msg("Couldn't allocate path\n"); +		return -ENOMEM; +	} + +	path->leave_spinning = 1; +	ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); +	if (ret) { +		test_msg("Couldn't insert ref %d\n", ret); +		btrfs_free_path(path); +		return ret; +	} + +	leaf = path->nodes[0]; +	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); +	btrfs_set_extent_refs(leaf, item, 1); +	btrfs_set_extent_generation(leaf, item, 1); +	btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); +	block_info = (struct btrfs_tree_block_info *)(item + 1); +	btrfs_set_tree_block_level(leaf, block_info, 1); +	iref = (struct btrfs_extent_inline_ref *)(block_info + 1); +	if (parent > 0) { +		btrfs_set_extent_inline_ref_type(leaf, iref, +						 BTRFS_SHARED_BLOCK_REF_KEY); +		btrfs_set_extent_inline_ref_offset(leaf, iref, parent); +	} else { +		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); +		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); +	} +	btrfs_free_path(path); +	return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, +			u64 parent, u64 root_objectid) +{ +	struct btrfs_trans_handle trans; +	struct btrfs_extent_item *item; +	struct btrfs_path *path; +	struct btrfs_key key; +	u64 refs; +	int ret; + +	init_dummy_trans(&trans); + +	key.objectid = bytenr; +	key.type = BTRFS_EXTENT_ITEM_KEY; +	key.offset = num_bytes; + +	path = btrfs_alloc_path(); +	if (!path) { +		test_msg("Couldn't allocate path\n"); +		return -ENOMEM; +	} + +	path->leave_spinning = 1; +	ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); +	if (ret) { +		test_msg("Couldn't find extent ref\n"); +		btrfs_free_path(path); +		return ret; +	} + +	item = btrfs_item_ptr(path->nodes[0], path->slots[0], +			      struct btrfs_extent_item); +	refs = btrfs_extent_refs(path->nodes[0], item); +	btrfs_set_extent_refs(path->nodes[0], item, refs + 1); +	btrfs_release_path(path); + +	key.objectid = bytenr; +	if (parent) { +		key.type = BTRFS_SHARED_BLOCK_REF_KEY; +		key.offset = parent; +	} else { +		key.type = BTRFS_TREE_BLOCK_REF_KEY; +		key.offset = root_objectid; +	} + +	ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); +	if (ret) +		test_msg("Failed to insert backref\n"); +	btrfs_free_path(path); +	return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, +			   
   u64 num_bytes) +{ +	struct btrfs_trans_handle trans; +	struct btrfs_key key; +	struct btrfs_path *path; +	int ret; + +	init_dummy_trans(&trans); + +	key.objectid = bytenr; +	key.type = BTRFS_EXTENT_ITEM_KEY; +	key.offset = num_bytes; + +	path = btrfs_alloc_path(); +	if (!path) { +		test_msg("Couldn't allocate path\n"); +		return -ENOMEM; +	} +	path->leave_spinning = 1; + +	ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); +	if (ret) { +		test_msg("Didn't find our key %d\n", ret); +		btrfs_free_path(path); +		return ret; +	} +	btrfs_del_item(&trans, root, path); +	btrfs_free_path(path); +	return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, +			     u64 num_bytes, u64 parent, u64 root_objectid) +{ +	struct btrfs_trans_handle trans; +	struct btrfs_extent_item *item; +	struct btrfs_path *path; +	struct btrfs_key key; +	u64 refs; +	int ret; + +	init_dummy_trans(&trans); + +	key.objectid = bytenr; +	key.type = BTRFS_EXTENT_ITEM_KEY; +	key.offset = num_bytes; + +	path = btrfs_alloc_path(); +	if (!path) { +		test_msg("Couldn't allocate path\n"); +		return -ENOMEM; +	} + +	path->leave_spinning = 1; +	ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); +	if (ret) { +		test_msg("Couldn't find extent ref\n"); +		btrfs_free_path(path); +		return ret; +	} + +	item = btrfs_item_ptr(path->nodes[0], path->slots[0], +			      struct btrfs_extent_item); +	refs = btrfs_extent_refs(path->nodes[0], item); +	btrfs_set_extent_refs(path->nodes[0], item, refs - 1); +	btrfs_release_path(path); + +	key.objectid = bytenr; +	if (parent) { +		key.type = BTRFS_SHARED_BLOCK_REF_KEY; +		key.offset = parent; +	} else { +		key.type = BTRFS_TREE_BLOCK_REF_KEY; +		key.offset = root_objectid; +	} + +	ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); +	if (ret) { +		test_msg("Couldn't find backref %d\n", ret); +		btrfs_free_path(path); +		return ret; +	} +	btrfs_del_item(&trans, root, path); +	btrfs_free_path(path); +	return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root) +{ +	struct btrfs_trans_handle trans; +	struct btrfs_fs_info *fs_info = root->fs_info; +	int ret; + +	init_dummy_trans(&trans); + +	test_msg("Qgroup basic add\n"); +	ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); +	if (ret) { +		test_msg("Couldn't create a qgroup %d\n", ret); +		return ret; +	} + +	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, +				      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +	if (ret) { +		test_msg("Couldn't add space to a qgroup %d\n", ret); +		return ret; +	} + +	ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); +	if (ret) +		return ret; + +	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); +	if (ret) { +		test_msg("Delayed qgroup accounting failed %d\n", ret); +		return ret; +	} + +	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { +		test_msg("Qgroup counts didn't match expected values\n"); +		return -EINVAL; +	} + +	ret = remove_extent_item(root, 4096, 4096); +	if (ret) +		return -EINVAL; + +	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, +				      BTRFS_QGROUP_OPER_SUB_EXCL, 0); +	if (ret) { +		test_msg("Couldn't remove space from the qgroup %d\n", ret); +		return -EINVAL; +	} + +	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); +	if (ret) { +		test_msg("Qgroup accounting failed %d\n", ret); +		return -EINVAL; +	} + +	if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { +		test_msg("Qgroup counts didn't match expected values\n"); +		return -EINVAL; +	} + +	return 0; +} + +/* + * Add a ref for two different 
roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. + */ +static int test_multiple_refs(struct btrfs_root *root) +{ +	struct btrfs_trans_handle trans; +	struct btrfs_fs_info *fs_info = root->fs_info; +	int ret; + +	init_dummy_trans(&trans); + +	test_msg("Qgroup multiple refs test\n"); + +	/* We have 5 created already from the previous test */ +	ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); +	if (ret) { +		test_msg("Couldn't create a qgroup %d\n", ret); +		return ret; +	} + +	ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); +	if (ret) +		return ret; + +	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, +				      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +	if (ret) { +		test_msg("Couldn't add space to a qgroup %d\n", ret); +		return ret; +	} + +	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); +	if (ret) { +		test_msg("Delayed qgroup accounting failed %d\n", ret); +		return ret; +	} + +	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { +		test_msg("Qgroup counts didn't match expected values\n"); +		return -EINVAL; +	} + +	ret = add_tree_ref(root, 4096, 4096, 0, 256); +	if (ret) +		return ret; + +	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, +				      BTRFS_QGROUP_OPER_ADD_SHARED, 0); +	if (ret) { +		test_msg("Qgroup record ref failed %d\n", ret); +		return ret; +	} + +	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); +	if (ret) { +		test_msg("Qgroup accounting failed %d\n", ret); +		return ret; +	} + +	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { +		test_msg("Qgroup counts didn't match expected values\n"); +		return -EINVAL; +	} + +	if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { +		test_msg("Qgroup counts didn't match expected values\n"); +		return -EINVAL; +	} + +	ret = remove_extent_ref(root, 4096, 4096, 0, 256); +	if (ret) +		return ret; + +	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, +				      BTRFS_QGROUP_OPER_SUB_SHARED, 0); +	if (ret) { +		test_msg("Qgroup record ref failed %d\n", ret); +		return ret; +	} + +	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); +	if (ret) { +		test_msg("Qgroup accounting failed %d\n", ret); +		return ret; +	} + +	if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { +		test_msg("Qgroup counts didn't match expected values\n"); +		return -EINVAL; +	} + +	if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { +		test_msg("Qgroup counts didn't match expected values\n"); +		return -EINVAL; +	} + +	return 0; +} + +int btrfs_test_qgroups(void) +{ +	struct btrfs_root *root; +	struct btrfs_root *tmp_root; +	int ret = 0; + +	root = btrfs_alloc_dummy_root(); +	if (IS_ERR(root)) { +		test_msg("Couldn't allocate root\n"); +		return PTR_ERR(root); +	} + +	root->fs_info = btrfs_alloc_dummy_fs_info(); +	if (!root->fs_info) { +		test_msg("Couldn't allocate dummy fs info\n"); +		ret = -ENOMEM; +		goto out; +	} + +	/* +	 * Can't use bytenr 0, some things freak out +	 * *cough*backref walking code*cough* +	 */ +	root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); +	if (!root->node) { +		test_msg("Couldn't allocate dummy buffer\n"); +		ret = -ENOMEM; +		goto out; +	} +	root->alloc_bytenr += 8192; + +	tmp_root = btrfs_alloc_dummy_root(); +	if (IS_ERR(tmp_root)) { +		test_msg("Couldn't allocate a fs root\n"); +		ret = PTR_ERR(tmp_root); +		goto out; +	} + +	tmp_root->root_key.objectid = 5; +	root->fs_info->fs_root = tmp_root; +	ret = 
btrfs_insert_fs_root(root->fs_info, tmp_root); +	if (ret) { +		test_msg("Couldn't insert fs root %d\n", ret); +		goto out; +	} + +	tmp_root = btrfs_alloc_dummy_root(); +	if (IS_ERR(tmp_root)) { +		test_msg("Couldn't allocate a fs root\n"); +		ret = PTR_ERR(tmp_root); +		goto out; +	} + +	tmp_root->root_key.objectid = 256; +	ret = btrfs_insert_fs_root(root->fs_info, tmp_root); +	if (ret) { +		test_msg("Couldn't insert fs root %d\n", ret); +		goto out; +	} + +	/* We are using this root as our extent root */ +	root->fs_info->extent_root = root; + +	/* +	 * Some of the paths we test assume we have a filled out fs_info, so we +	 * just need to add the root in there so we don't panic. +	 */ +	root->fs_info->tree_root = root; +	root->fs_info->quota_root = root; +	root->fs_info->quota_enabled = 1; + +	test_msg("Running qgroup tests\n"); +	ret = test_no_shared_qgroup(root); +	if (ret) +		goto out; +	ret = test_multiple_refs(root); +out: +	btrfs_free_dummy_root(root); +	return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7579f6d0b854..9630f10f8e1e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@  #include "inode-map.h"  #include "volumes.h"  #include "dev-replace.h" +#include "qgroup.h"  #define BTRFS_ROOT_TRANS_TAG 0 @@ -241,18 +242,19 @@ loop:  static int record_root_in_trans(struct btrfs_trans_handle *trans,  			       struct btrfs_root *root)  { -	if (root->ref_cows && root->last_trans < trans->transid) { +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +	    root->last_trans < trans->transid) {  		WARN_ON(root == root->fs_info->extent_root);  		WARN_ON(root->commit_root != root->node);  		/* -		 * see below for in_trans_setup usage rules +		 * see below for IN_TRANS_SETUP usage rules  		 * we have the reloc mutex held now, so there  		 * is only one writer in this function  		 */ -		root->in_trans_setup = 1; +		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); -		/* make sure readers find in_trans_setup before +		/* make sure readers find IN_TRANS_SETUP before  		 * they find our root->last_trans update  		 */  		smp_wmb(); @@ -279,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,  		 * But, we have to set root->last_trans before we  		 * init the relocation root, otherwise, we trip over warnings  		 * in ctree.c.  The solution used here is to flag ourselves -		 * with root->in_trans_setup.  When this is 1, we're still +		 * with root IN_TRANS_SETUP.  When this is 1, we're still  		 * fixing up the reloc trees and everyone must wait. 
*  		 * When this is zero, they can trust root->last_trans and fly @@ -288,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,  		 * done before we pop in the zero below  		 */  		btrfs_init_reloc_root(trans, root); -		smp_wmb(); -		root->in_trans_setup = 0; +		smp_mb__before_atomic(); +		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);  	}  	return 0;  } @@ -298,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,  int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,  			       struct btrfs_root *root)  { -	if (!root->ref_cows) +	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		return 0;  	/* -	 * see record_root_in_trans for comments about in_trans_setup usage +	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage  	 * and barriers  	 */  	smp_rmb();  	if (root->last_trans == trans->transid && -	    !root->in_trans_setup) +	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))  		return 0;  	mutex_lock(&root->fs_info->reloc_mutex); @@ -365,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)  static inline bool need_reserve_reloc_root(struct btrfs_root *root)  {  	if (!root->fs_info->reloc_ctl || -	    !root->ref_cows || +	    !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||  	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||  	    root->reloc_root)  		return false; @@ -695,6 +697,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	unsigned long cur = trans->delayed_ref_updates;  	int lock = (trans->type != TRANS_JOIN_NOLOCK);  	int err = 0; +	int must_run_delayed_refs = 0;  	if (trans->use_count > 1) {  		trans->use_count--; @@ -702,14 +705,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		return 0;  	} -	/* -	 * do the qgroup accounting as early as possible -	 */ -	err = btrfs_delayed_refs_qgroup_accounting(trans, info); -  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; +	if (!list_empty(&trans->new_bgs)) +		btrfs_create_pending_block_groups(trans, root); + +	trans->delayed_ref_updates = 0; +	if (!trans->sync) { +		must_run_delayed_refs = +			btrfs_should_throttle_delayed_refs(trans, root); +		cur = max_t(unsigned long, cur, 32); + +		/* +		 * don't make the caller wait if they are from a NOLOCK +		 * or ATTACH transaction, it will deadlock with commit +		 */ +		if (must_run_delayed_refs == 1 && +		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) +			must_run_delayed_refs = 2; +	} +  	if (trans->qgroup_reserved) {  		/*  		 * the same root has to be passed here between start_transaction @@ -719,16 +735,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  		trans->qgroup_reserved = 0;  	} -	if (!list_empty(&trans->new_bgs)) -		btrfs_create_pending_block_groups(trans, root); - -	trans->delayed_ref_updates = 0; -	if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { -		cur = max_t(unsigned long, cur, 32); -		trans->delayed_ref_updates = 0; -		btrfs_run_delayed_refs(trans, root, cur); -	} -  	btrfs_trans_release_metadata(trans, root);  	trans->block_rsv = NULL; @@ -778,6 +784,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	assert_qgroups_uptodate(trans);  	kmem_cache_free(btrfs_trans_handle_cachep, trans); +	if (must_run_delayed_refs) { +		btrfs_async_run_delayed_refs(root, cur, +					     must_run_delayed_refs == 1); +	}  	return err;  } @@ -1049,8 +1059,8 @@ static noinline int commit_fs_roots(struct 
btrfs_trans_handle *trans,  			btrfs_save_ino_cache(root, trans);  			/* see comments in should_cow_block() */ -			root->force_cow = 0; -			smp_wmb(); +			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); +			smp_mb__after_atomic();  			if (root->commit_root != root->node) {  				list_add_tail(&root->dirty_list, @@ -1081,7 +1091,7 @@ int btrfs_defrag_root(struct btrfs_root *root)  	struct btrfs_trans_handle *trans;  	int ret; -	if (xchg(&root->defrag_running, 1)) +	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))  		return 0;  	while (1) { @@ -1104,7 +1114,7 @@ int btrfs_defrag_root(struct btrfs_root *root)  			break;  		}  	} -	root->defrag_running = 0; +	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);  	return ret;  } @@ -1168,12 +1178,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  			goto no_free_objectid;  	} -	pending->error = btrfs_qgroup_inherit(trans, fs_info, -					      root->root_key.objectid, -					      objectid, pending->inherit); -	if (pending->error) -		goto no_free_objectid; -  	key.objectid = objectid;  	key.offset = (u64)-1;  	key.type = BTRFS_ROOT_ITEM_KEY; @@ -1270,8 +1274,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  		goto fail;  	} +	/* +	 * We need to flush delayed refs in order to make sure all of our quota +	 * operations have been done before we call btrfs_qgroup_inherit. +	 */ +	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto fail; +	} + +	pending->error = btrfs_qgroup_inherit(trans, fs_info, +					      root->root_key.objectid, +					      objectid, pending->inherit); +	if (pending->error) +		goto no_free_objectid; +  	/* see comments in should_cow_block() */ -	root->force_cow = 1; +	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);  	smp_wmb();  	btrfs_set_root_node(new_root_item, tmp); @@ -1598,12 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,  	 * them now so that they hinder processing of more delayed refs  	 * as little as possible.  	 */ -	if (ret) { -		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); -		return ret; -	} - -	ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);  	if (ret)  		return ret; @@ -1984,19 +1998,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)  	}  	root = list_first_entry(&fs_info->dead_roots,  			struct btrfs_root, root_list); -	/* -	 * Make sure root is not involved in send, -	 * if we fail with first root, we return -	 * directly rather than continue. 
-	 */ -	spin_lock(&root->root_item_lock); -	if (root->send_in_progress) { -		spin_unlock(&fs_info->trans_lock); -		spin_unlock(&root->root_item_lock); -		return 0; -	} -	spin_unlock(&root->root_item_lock); -  	list_del_init(&root->root_list);  	spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b57b924e8e03..7dd558ed0716 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -69,6 +69,7 @@ struct btrfs_transaction {  #define __TRANS_ATTACH		(1U << 10)  #define __TRANS_JOIN		(1U << 11)  #define __TRANS_JOIN_NOLOCK	(1U << 12) +#define __TRANS_DUMMY		(1U << 13)  #define TRANS_USERSPACE		(__TRANS_USERSPACE | __TRANS_FREEZABLE)  #define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE) diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 76928ca97741..a63719cc9578 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,  		goto out;  	} -	if (root->ref_cows == 0) +	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		goto out;  	if (btrfs_test_opt(root, SSD)) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2f45fc02610..9e1f2cd5e67a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,13 +20,11 @@  #include <linux/slab.h>  #include <linux/blkdev.h>  #include <linux/list_sort.h> -#include "ctree.h" -#include "transaction.h" +#include "tree-log.h"  #include "disk-io.h"  #include "locking.h"  #include "print-tree.h"  #include "backref.h" -#include "tree-log.h"  #include "hash.h"  /* magic values for the inode_only field in btrfs_log_inode: @@ -144,17 +142,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans,  	mutex_lock(&root->log_mutex);  	if (root->log_root) { -		if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == -		    trans->transid) { +		if (btrfs_need_log_full_commit(root->fs_info, trans)) {  			ret = -EAGAIN;  			goto out;  		} -  		if (!root->log_start_pid) {  			root->log_start_pid = current->pid; -			root->log_multiple_pids = false; +			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);  		} else if (root->log_start_pid != current->pid) { -			root->log_multiple_pids = true; +			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);  		}  		atomic_inc(&root->log_batch); @@ -181,7 +177,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,  		if (ret)  			goto out;  	} -	root->log_multiple_pids = false; +	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);  	root->log_start_pid = current->pid;  	atomic_inc(&root->log_batch);  	atomic_inc(&root->log_writers); @@ -2500,7 +2496,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	while (1) {  		int batch = atomic_read(&root->log_batch);  		/* when we're on an ssd, just kick the log commit out */ -		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { +		if (!btrfs_test_opt(root, SSD) && +		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {  			mutex_unlock(&root->log_mutex);  			schedule_timeout_uninterruptible(1);  			mutex_lock(&root->log_mutex); @@ -2511,8 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	}  	/* bail out if we need to do a full commit */ -	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == -	    trans->transid) { +	if (btrfs_need_log_full_commit(root->fs_info, trans)) {  		ret = -EAGAIN;  		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&root->log_mutex); @@ -2533,8 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  		
blk_finish_plug(&plug);  		btrfs_abort_transaction(trans, root, ret);  		btrfs_free_logged_extents(log, log_transid); -		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = -								trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans);  		mutex_unlock(&root->log_mutex);  		goto out;  	} @@ -2577,8 +2572,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  			list_del_init(&root_log_ctx.list);  		blk_finish_plug(&plug); -		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = -								trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans); +  		if (ret != -ENOSPC) {  			btrfs_abort_transaction(trans, root, ret);  			mutex_unlock(&log_root_tree->log_mutex); @@ -2622,8 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	 * now that we've moved on to the tree of log tree roots,  	 * check the full commit flag again  	 */ -	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == -	    trans->transid) { +	if (btrfs_need_log_full_commit(root->fs_info, trans)) {  		blk_finish_plug(&plug);  		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);  		btrfs_free_logged_extents(log, log_transid); @@ -2637,8 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  					 EXTENT_DIRTY | EXTENT_NEW);  	blk_finish_plug(&plug);  	if (ret) { -		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = -								trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans);  		btrfs_abort_transaction(trans, root, ret);  		btrfs_free_logged_extents(log, log_transid);  		mutex_unlock(&log_root_tree->log_mutex); @@ -2667,8 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	 */  	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);  	if (ret) { -		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = -								trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans);  		btrfs_abort_transaction(trans, root, ret);  		goto out_wake_log_root;  	} @@ -2886,7 +2878,7 @@ fail:  out_unlock:  	mutex_unlock(&BTRFS_I(dir)->log_mutex);  	if (ret == -ENOSPC) { -		root->fs_info->last_trans_log_full_commit = trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans);  		ret = 0;  	} else if (ret < 0)  		btrfs_abort_transaction(trans, root, ret); @@ -2919,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,  				  dirid, &index);  	mutex_unlock(&BTRFS_I(inode)->log_mutex);  	if (ret == -ENOSPC) { -		root->fs_info->last_trans_log_full_commit = trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans);  		ret = 0;  	} else if (ret < 0 && ret != -ENOENT)  		btrfs_abort_transaction(trans, root, ret); @@ -4130,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,  			 * make sure any commits to the log are forced  			 * to be full commits  			 */ -			root->fs_info->last_trans_log_full_commit = -				trans->transid; +			btrfs_set_log_full_commit(root->fs_info, trans);  			ret = 1;  			break;  		} @@ -4177,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,  		goto end_no_trans;  	} +	/* +	 * The previous transaction commit didn't complete, so we need to do a +	 * full commit ourselves. 
+	 */  	if (root->fs_info->last_trans_log_full_commit >  	    root->fs_info->last_trans_committed) {  		ret = 1; @@ -4246,7 +4241,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,  end_trans:  	dput(old_parent);  	if (ret < 0) { -		root->fs_info->last_trans_log_full_commit = trans->transid; +		btrfs_set_log_full_commit(root->fs_info, trans);  		ret = 1;  	} diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 91b145fce333..7f5b41bd5373 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@  #ifndef __TREE_LOG_  #define __TREE_LOG_ +#include "ctree.h" +#include "transaction.h" +  /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */  #define BTRFS_NO_LOG_SYNC 256 @@ -35,6 +38,19 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)  	INIT_LIST_HEAD(&ctx->list);  } +static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, +					     struct btrfs_trans_handle *trans) +{ +	ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; +} + +static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, +					     struct btrfs_trans_handle *trans) +{ +	return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == +		trans->transid; +} +  int btrfs_sync_log(struct btrfs_trans_handle *trans,  		   struct btrfs_root *root, struct btrfs_log_ctx *ctx);  int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49d7fab73360..ffeed6d6326f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1452,6 +1452,22 @@ out:  	return ret;  } +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. 
+ */ +static void update_dev_time(char *path_name) +{ +	struct file *filp; + +	filp = filp_open(path_name, O_RDWR, 0); +	if (!filp) +		return; +	file_update_time(filp); +	filp_close(filp, NULL); +	return; +} +  static int btrfs_rm_dev_item(struct btrfs_root *root,  			     struct btrfs_device *device)  { @@ -1674,11 +1690,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		struct btrfs_fs_devices *fs_devices;  		fs_devices = root->fs_info->fs_devices;  		while (fs_devices) { -			if (fs_devices->seed == cur_devices) +			if (fs_devices->seed == cur_devices) { +				fs_devices->seed = cur_devices->seed;  				break; +			}  			fs_devices = fs_devices->seed;  		} -		fs_devices->seed = cur_devices->seed;  		cur_devices->seed = NULL;  		lock_chunks(root);  		__btrfs_close_devices(cur_devices); @@ -1694,20 +1711,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	 * remove it from the devices list and zero out the old super  	 */  	if (clear_super && disk_super) { +		u64 bytenr; +		int i; +  		/* make sure this device isn't detected as part of  		 * the FS anymore  		 */  		memset(&disk_super->magic, 0, sizeof(disk_super->magic));  		set_buffer_dirty(bh);  		sync_dirty_buffer(bh); + +		/* clear the mirror copies of the super block on the disk +		 * being removed; the 0th copy has been taken care of above +		 * and the loop below takes care of the rest +		 */ +		for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { +			bytenr = btrfs_sb_offset(i); +			if (bytenr + BTRFS_SUPER_INFO_SIZE >= +					i_size_read(bdev->bd_inode)) +				break; + +			brelse(bh); +			bh = __bread(bdev, bytenr / 4096, +					BTRFS_SUPER_INFO_SIZE); +			if (!bh) +				continue; + +			disk_super = (struct btrfs_super_block *)bh->b_data; + +			if (btrfs_super_bytenr(disk_super) != bytenr || +				btrfs_super_magic(disk_super) != BTRFS_MAGIC) { +				continue; +			} +			memset(&disk_super->magic, 0, +						sizeof(disk_super->magic)); +			set_buffer_dirty(bh); +			sync_dirty_buffer(bh); +		}  	}  	ret = 0; -	/* Notify udev that device has changed */ -	if (bdev) +	if (bdev) { +		/* Notify udev that device has changed */  		btrfs_kobject_uevent(bdev, KOBJ_CHANGE); +		/* Update ctime/mtime for device path for libblkid */ +		update_dev_time(device_path); +	} +  error_brelse:  	brelse(bh);  	if (bdev) @@ -1883,7 +1935,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)  	fs_devices->seeding = 0;  	fs_devices->num_devices = 0;  	fs_devices->open_devices = 0; -	fs_devices->total_devices = 0;  	fs_devices->seed = seed_devices;  	generate_random_uuid(fs_devices->fsid); @@ -2146,6 +2197,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  		ret = btrfs_commit_transaction(trans, root);  	} +	/* Update ctime/mtime for libblkid */ +	update_dev_time(device_path);  	return ret;  error_trans: @@ -2922,6 +2975,16 @@ static int should_balance_chunk(struct btrfs_root *root,  		return 0;  	} +	/* +	 * limited by count, must be the last filter +	 */ +	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { +		if (bargs->limit == 0) +			return 0; +		else +			bargs->limit--; +	} +  	return 1;  } @@ -2944,6 +3007,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)  	int ret;  	int enospc_errors = 0;  	bool counting = true; +	u64 limit_data = bctl->data.limit; +	u64 limit_meta = bctl->meta.limit; +	u64 limit_sys = bctl->sys.limit;  	/* step one make some room on all the devices */  	devices = &fs_info->fs_devices->devices; @@ -2982,6 +3048,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)  	
memset(&bctl->stat, 0, sizeof(bctl->stat));  	spin_unlock(&fs_info->balance_lock);  again: +	if (!counting) { +		bctl->data.limit = limit_data; +		bctl->meta.limit = limit_meta; +		bctl->sys.limit = limit_sys; +	}  	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;  	key.offset = (u64)-1;  	key.type = BTRFS_CHUNK_ITEM_KEY; @@ -3881,7 +3952,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,  	u8 *ptr;  	array_size = btrfs_super_sys_array_size(super_copy); -	if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) +	if (array_size + item_size + sizeof(disk_key) +			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)  		return -EFBIG;  	ptr = super_copy->sys_chunk_array + array_size; @@ -3986,6 +4058,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)  	btrfs_set_fs_incompat(info, RAID56);  } +#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)		\ +			- sizeof(struct btrfs_item)		\ +			- sizeof(struct btrfs_chunk))		\ +			/ sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\ +				- 2 * sizeof(struct btrfs_disk_key)	\ +				- 2 * sizeof(struct btrfs_chunk))	\ +				/ sizeof(struct btrfs_stripe) + 1) +  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  			       struct btrfs_root *extent_root, u64 start,  			       u64 type) @@ -4035,6 +4117,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	if (type & BTRFS_BLOCK_GROUP_DATA) {  		max_stripe_size = 1024 * 1024 * 1024;  		max_chunk_size = 10 * max_stripe_size; +		if (!devs_max) +			devs_max = BTRFS_MAX_DEVS(info->chunk_root);  	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {  		/* for larger filesystems, use larger metadata chunks */  		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) @@ -4042,11 +4126,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		else  			max_stripe_size = 256 * 1024 * 1024;  		max_chunk_size = max_stripe_size; +		if (!devs_max) +			devs_max = BTRFS_MAX_DEVS(info->chunk_root);  	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {  		max_stripe_size = 32 * 1024 * 1024;  		max_chunk_size = 2 * max_stripe_size; +		if (!devs_max) +			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;  	} else { -		btrfs_err(info, "invalid chunk type 0x%llx requested\n", +		btrfs_err(info, "invalid chunk type 0x%llx requested",  		       type);  		BUG_ON(1);  	} @@ -4294,7 +4382,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,  	if (em->start != chunk_offset || em->len != chunk_size) {  		btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" -			  " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, +			  " %Lu-%Lu, found %Lu-%Lu", chunk_offset,  			  chunk_size, em->start, em->len);  		free_extent_map(em);  		return -EINVAL; @@ -4496,14 +4584,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  	 * and exit, so return 1 so the callers don't try to use other copies.  	 
*/  	if (!em) { -		btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, +		btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,  			    logical+len);  		return 1;  	}  	if (em->start > logical || em->start + em->len < logical) {  		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " -			    "%Lu-%Lu\n", logical, logical+len, em->start, +			    "%Lu-%Lu", logical, logical+len, em->start,  			    em->start + em->len);  		free_extent_map(em);  		return 1; @@ -4684,7 +4772,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  	if (em->start > logical || em->start + em->len < logical) {  		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " -			   "found %Lu-%Lu\n", logical, em->start, +			   "found %Lu-%Lu", logical, em->start,  			   em->start + em->len);  		free_extent_map(em);  		return -EINVAL; @@ -6058,10 +6146,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;  	struct btrfs_device *device; -	mutex_lock(&fs_devices->device_list_mutex); -	list_for_each_entry(device, &fs_devices->devices, dev_list) -		device->dev_root = fs_info->dev_root; -	mutex_unlock(&fs_devices->device_list_mutex); +	while (fs_devices) { +		mutex_lock(&fs_devices->device_list_mutex); +		list_for_each_entry(device, &fs_devices->devices, dev_list) +			device->dev_root = fs_info->dev_root; +		mutex_unlock(&fs_devices->device_list_mutex); + +		fs_devices = fs_devices->seed; +	}  }  static void __btrfs_reset_dev_stats(struct btrfs_device *dev) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 80754f9dd3df..1a15bbeb65e2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -255,6 +255,7 @@ struct map_lookup {  #define BTRFS_BALANCE_ARGS_DEVID	(1ULL << 2)  #define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)  #define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)  /*   * Profile changing flags.  When SOFT is set we won't relocate chunk if diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8e57191950cb..4f196314c0c1 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws,  	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {  		printk(KERN_WARNING "BTRFS: deflateInit failed\n"); -		ret = -1; +		ret = -EIO;  		goto out;  	} @@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws,  	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);  	if (out_page == NULL) { -		ret = -1; +		ret = -ENOMEM;  		goto out;  	}  	cpage_out = kmap(out_page); @@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws,  			printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",  			       ret);  			zlib_deflateEnd(&workspace->def_strm); -			ret = -1; +			ret = -EIO;  			goto out;  		} @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,  		if (workspace->def_strm.total_in > 8192 &&  		    workspace->def_strm.total_in <  		    workspace->def_strm.total_out) { -			ret = -1; +			ret = -EIO;  			goto out;  		}  		/* we need another page for writing out.  
Test this @@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws,  			kunmap(out_page);  			if (nr_pages == nr_dest_pages) {  				out_page = NULL; -				ret = -1; +				ret = -E2BIG;  				goto out;  			}  			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);  			if (out_page == NULL) { -				ret = -1; +				ret = -ENOMEM;  				goto out;  			}  			cpage_out = kmap(out_page); @@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws,  	zlib_deflateEnd(&workspace->def_strm);  	if (ret != Z_STREAM_END) { -		ret = -1; +		ret = -EIO;  		goto out;  	}  	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { -		ret = -1; +		ret = -E2BIG;  		goto out;  	} @@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,  	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {  		printk(KERN_WARNING "BTRFS: inflateInit failed\n"); -		return -1; +		return -EIO;  	}  	while (workspace->inf_strm.total_in < srclen) {  		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); @@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,  		}  	}  	if (ret != Z_STREAM_END) -		ret = -1; +		ret = -EIO;  	else  		ret = 0;  done: @@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,  	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {  		printk(KERN_WARNING "BTRFS: inflateInit failed\n"); -		return -1; +		return -EIO;  	}  	while (bytes_left > 0) { @@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,  		total_out = workspace->inf_strm.total_out;  		if (total_out == buf_start) { -			ret = -1; +			ret = -EIO;  			break;  		} @@ -382,7 +382,7 @@ next:  	}  	if (ret != Z_STREAM_END && bytes_left != 0) -		ret = -1; +		ret = -EIO;  	else  		ret = 0; | 
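
Editorial note: the qgroup selftests added in tests/qgroup-tests.c above check specific referenced/exclusive byte counts after each step (4096/4096 while root 5 alone owns the block, 4096/0 for both roots once root 256 shares it, and back to 4096/4096 after the extra ref is removed). The following is a minimal standalone userspace model of that arithmetic only; it is not the kernel's qgroup accounting code, and the struct and function names in it (struct qgroup, account()) are invented for illustration.

/*
 * Userspace model of the rfer/excl expectations in the new qgroup
 * selftests.  One 4096-byte tree block, two roots; counters are
 * recomputed from which roots currently hold a reference.
 */
#include <assert.h>
#include <stdio.h>

#define NROOTS 2
#define EXTENT_LEN 4096ULL

struct qgroup {
	unsigned long long rfer;	/* bytes referenced by this group */
	unsigned long long excl;	/* bytes referenced only by this group */
};

static void account(struct qgroup g[NROOTS], const int refs[NROOTS])
{
	int holders = 0;

	for (int i = 0; i < NROOTS; i++)
		holders += refs[i];

	for (int i = 0; i < NROOTS; i++) {
		g[i].rfer = refs[i] ? EXTENT_LEN : 0;
		/* exclusive only while a single root references the block */
		g[i].excl = (refs[i] && holders == 1) ? EXTENT_LEN : 0;
	}
}

int main(void)
{
	struct qgroup g[NROOTS] = { {0, 0}, {0, 0} };	/* [0] = root 5, [1] = root 256 */

	/* test_no_shared_qgroup: only root 5 references the block */
	account(g, (const int[]){1, 0});
	assert(g[0].rfer == 4096 && g[0].excl == 4096);

	/* extent item removed again: counts drop back to zero */
	account(g, (const int[]){0, 0});
	assert(g[0].rfer == 0 && g[0].excl == 0);

	/* test_multiple_refs: both roots reference it, nothing is exclusive */
	account(g, (const int[]){1, 1});
	assert(g[0].rfer == 4096 && g[0].excl == 0);
	assert(g[1].rfer == 4096 && g[1].excl == 0);

	/* root 256 drops its ref: root 5 is the exclusive owner again */
	account(g, (const int[]){1, 0});
	assert(g[0].rfer == 4096 && g[0].excl == 4096);
	assert(g[1].rfer == 0 && g[1].excl == 0);

	printf("counts match the selftest expectations\n");
	return 0;
}

Compiled with a C99 compiler and run, the asserts mirror the btrfs_verify_qgroup_counts() checks in test_no_shared_qgroup() and test_multiple_refs().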
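Editorial note: the new BTRFS_BALANCE_ARGS_LIMIT filter in should_balance_chunk() only spends its budget on chunks that already passed every other filter (hence "must be the last filter"), and __btrfs_balance() restores the saved limits before the second, non-counting pass. A small standalone sketch of that ordering follows; other_filters_pass() is an invented stand-in for the real profile/usage/devid filters, not a btrfs function.

/* Sketch of the balance "limit" filter countdown and reset behaviour. */
#include <stdbool.h>
#include <stdio.h>

#define BALANCE_ARGS_LIMIT (1ULL << 5)	/* same bit value as in volumes.h */

struct balance_args {
	unsigned long long flags;
	unsigned long long limit;	/* max chunks to relocate */
};

/* stand-in: pretend even-numbered chunks pass all the other filters */
static bool other_filters_pass(int chunk)
{
	return (chunk % 2) == 0;
}

/* the limit check runs last, so only surviving chunks consume the budget */
static bool should_balance_chunk(struct balance_args *bargs, int chunk)
{
	if (!other_filters_pass(chunk))
		return false;

	if (bargs->flags & BALANCE_ARGS_LIMIT) {
		if (bargs->limit == 0)
			return false;
		bargs->limit--;
	}
	return true;
}

int main(void)
{
	struct balance_args bargs = { .flags = BALANCE_ARGS_LIMIT, .limit = 3 };
	unsigned long long saved_limit = bargs.limit;	/* like limit_data/meta/sys */
	int relocated = 0;

	for (int chunk = 0; chunk < 20; chunk++)
		if (should_balance_chunk(&bargs, chunk))
			relocated++;

	printf("counting pass selected %d chunks\n", relocated);	/* 3 */

	/* the real pass starts at the "again:" label with the limits restored */
	bargs.limit = saved_limit;
	return 0;
}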
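Editorial note: the BTRFS_MAX_DEVS()/BTRFS_MAX_DEVS_SYS_CHUNK macros added above cap the number of stripes by what still fits in one chunk item in a leaf, or in the superblock's sys_chunk_array. Below is a worked example of that arithmetic only; the concrete sizes (16K nodes with a 101-byte header, 25-byte items, 48-byte chunk header, 32-byte stripes, 17-byte disk keys, 2048-byte system array) are assumptions chosen for illustration and are not taken from this patch.

/* Worked example of the stripe-count caps, under the assumed sizes above. */
#include <stdio.h>

int main(void)
{
	const unsigned leaf_data_size = 16384 - 101;	/* assumed BTRFS_LEAF_DATA_SIZE */
	const unsigned item_size      = 25;		/* assumed struct btrfs_item */
	const unsigned chunk_size     = 48;		/* assumed btrfs_chunk, no stripes */
	const unsigned stripe_size    = 32;		/* assumed struct btrfs_stripe */
	const unsigned disk_key_size  = 17;		/* assumed struct btrfs_disk_key */
	const unsigned sys_array_size = 2048;		/* assumed sys_chunk_array size */

	/* BTRFS_MAX_DEVS(): stripes that fit alongside one chunk item in a leaf */
	unsigned max_devs =
		(leaf_data_size - item_size - chunk_size) / stripe_size + 1;

	/* BTRFS_MAX_DEVS_SYS_CHUNK: stripes that fit in the superblock array */
	unsigned max_devs_sys =
		(sys_array_size - 2 * disk_key_size - 2 * chunk_size) / stripe_size + 1;

	printf("data/metadata chunk stripe cap: %u\n", max_devs);
	printf("system chunk stripe cap:        %u\n", max_devs_sys);
	return 0;
}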
