Diffstat (limited to 'fs')
41 files changed, 1309 insertions, 597 deletions
| diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 08405a3da6b1..b90cd3776f8e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -560,13 +560,13 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,   */  static void __merge_refs(struct list_head *head, int mode)  { -	struct __prelim_ref *ref1; +	struct __prelim_ref *pos1; -	list_for_each_entry(ref1, head, list) { -		struct __prelim_ref *ref2 = ref1, *tmp; +	list_for_each_entry(pos1, head, list) { +		struct __prelim_ref *pos2 = pos1, *tmp; -		list_for_each_entry_safe_continue(ref2, tmp, head, list) { -			struct __prelim_ref *xchg; +		list_for_each_entry_safe_continue(pos2, tmp, head, list) { +			struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2;  			struct extent_inode_elem *eie;  			if (!ref_for_same_block(ref1, ref2)) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 97ad9bbeb35d..bfe4a337fb4d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1614,7 +1614,7 @@ struct btrfs_fs_info {  	spinlock_t delayed_iput_lock;  	struct list_head delayed_iputs; -	struct rw_semaphore delayed_iput_sem; +	struct mutex cleaner_delayed_iput_mutex;  	/* this protects tree_mod_seq_list */  	spinlock_t tree_mod_seq_lock; @@ -3641,6 +3641,7 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,  int __get_raid_index(u64 flags);  int btrfs_start_write_no_snapshoting(struct btrfs_root *root);  void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);  void check_system_chunk(struct btrfs_trans_handle *trans,  			struct btrfs_root *root,  			const u64 type); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7dd4c..cbb7dbfb3fff 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(  		em = lookup_extent_mapping(em_tree, start, (u64)-1);  		if (!em)  			break; -		map = (struct map_lookup *)em->bdev; +		map = em->map_lookup;  		for (i = 0; i < map->num_stripes; i++)  			if (srcdev == map->stripes[i].dev)  				map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e99ccd6ffb2c..dd08e29f5117 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,12 @@  #include <asm/cpufeature.h>  #endif +#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\ +				 BTRFS_HEADER_FLAG_RELOC |\ +				 BTRFS_SUPER_FLAG_ERROR |\ +				 BTRFS_SUPER_FLAG_SEEDING |\ +				 BTRFS_SUPER_FLAG_METADUMP) +  static const struct extent_io_ops btree_extent_io_ops;  static void end_workqueue_fn(struct btrfs_work *work);  static void free_fs_root(struct btrfs_root *root); @@ -1583,8 +1589,23 @@ int btrfs_init_fs_root(struct btrfs_root *root)  	ret = get_anon_bdev(&root->anon_dev);  	if (ret)  		goto free_writers; + +	mutex_lock(&root->objectid_mutex); +	ret = btrfs_find_highest_objectid(root, +					&root->highest_objectid); +	if (ret) { +		mutex_unlock(&root->objectid_mutex); +		goto free_root_dev; +	} + +	ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + +	mutex_unlock(&root->objectid_mutex); +  	return 0; +free_root_dev: +	free_anon_bdev(root->anon_dev);  free_writers:  	btrfs_free_subvolume_writers(root->subv_writers);  fail: @@ -1786,7 +1807,10 @@ static int cleaner_kthread(void *arg)  			goto sleep;  		} +		mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);  		btrfs_run_delayed_iputs(root); +		mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); +  		again = 
btrfs_clean_one_deleted_snapshot(root);  		mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2556,8 +2580,8 @@ int open_ctree(struct super_block *sb,  	mutex_init(&fs_info->delete_unused_bgs_mutex);  	mutex_init(&fs_info->reloc_mutex);  	mutex_init(&fs_info->delalloc_root_mutex); +	mutex_init(&fs_info->cleaner_delayed_iput_mutex);  	seqlock_init(&fs_info->profiles_lock); -	init_rwsem(&fs_info->delayed_iput_sem);  	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);  	INIT_LIST_HEAD(&fs_info->space_info); @@ -2742,26 +2766,6 @@ int open_ctree(struct super_block *sb,  		goto fail_alloc;  	} -	/* -	 * Leafsize and nodesize were always equal, this is only a sanity check. -	 */ -	if (le32_to_cpu(disk_super->__unused_leafsize) != -	    btrfs_super_nodesize(disk_super)) { -		printk(KERN_ERR "BTRFS: couldn't mount because metadata " -		       "blocksizes don't match.  node %d leaf %d\n", -		       btrfs_super_nodesize(disk_super), -		       le32_to_cpu(disk_super->__unused_leafsize)); -		err = -EINVAL; -		goto fail_alloc; -	} -	if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { -		printk(KERN_ERR "BTRFS: couldn't mount because metadata " -		       "blocksize (%d) was too large\n", -		       btrfs_super_nodesize(disk_super)); -		err = -EINVAL; -		goto fail_alloc; -	} -  	features = btrfs_super_incompat_flags(disk_super);  	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;  	if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) @@ -2833,17 +2837,6 @@ int open_ctree(struct super_block *sb,  	sb->s_blocksize = sectorsize;  	sb->s_blocksize_bits = blksize_bits(sectorsize); -	if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { -		printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); -		goto fail_sb_buffer; -	} - -	if (sectorsize != PAGE_SIZE) { -		printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " -		       "found on %s\n", (unsigned long)sectorsize, sb->s_id); -		goto fail_sb_buffer; -	} -  	mutex_lock(&fs_info->chunk_mutex);  	ret = btrfs_read_sys_array(tree_root);  	mutex_unlock(&fs_info->chunk_mutex); @@ -2915,6 +2908,18 @@ retry_root_backup:  	tree_root->commit_root = btrfs_root_node(tree_root);  	btrfs_set_root_refs(&tree_root->root_item, 1); +	mutex_lock(&tree_root->objectid_mutex); +	ret = btrfs_find_highest_objectid(tree_root, +					&tree_root->highest_objectid); +	if (ret) { +		mutex_unlock(&tree_root->objectid_mutex); +		goto recovery_tree_root; +	} + +	ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + +	mutex_unlock(&tree_root->objectid_mutex); +  	ret = btrfs_read_roots(fs_info, tree_root);  	if (ret)  		goto recovery_tree_root; @@ -4018,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,  			      int read_only)  {  	struct btrfs_super_block *sb = fs_info->super_copy; +	u64 nodesize = btrfs_super_nodesize(sb); +	u64 sectorsize = btrfs_super_sectorsize(sb);  	int ret = 0; +	if (btrfs_super_magic(sb) != BTRFS_MAGIC) { +		printk(KERN_ERR "BTRFS: no valid FS found\n"); +		ret = -EINVAL; +	} +	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) +		printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", +				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);  	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {  		printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",  				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); @@ -4037,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,  	}  	/* -	 * The common minimum, we don't know if we can trust the nodesize/sectorsize -	 * items 
yet, they'll be verified later. Issue just a warning. +	 * Check sectorsize and nodesize first, other check will need it. +	 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.  	 */ -	if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) +	if (!is_power_of_2(sectorsize) || sectorsize < 4096 || +	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { +		printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); +		ret = -EINVAL; +	} +	/* Only PAGE SIZE is supported yet */ +	if (sectorsize != PAGE_CACHE_SIZE) { +		printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", +				sectorsize, PAGE_CACHE_SIZE); +		ret = -EINVAL; +	} +	if (!is_power_of_2(nodesize) || nodesize < sectorsize || +	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { +		printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); +		ret = -EINVAL; +	} +	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { +		printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", +				le32_to_cpu(sb->__unused_leafsize), +				nodesize); +		ret = -EINVAL; +	} + +	/* Root alignment check */ +	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {  		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",  				btrfs_super_root(sb)); -	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) +		ret = -EINVAL; +	} +	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {  		printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",  				btrfs_super_chunk_root(sb)); -	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) -		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", -				btrfs_super_log_root(sb)); - -	/* -	 * Check the lower bound, the alignment and other constraints are -	 * checked later. -	 */ -	if (btrfs_super_nodesize(sb) < 4096) { -		printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", -				btrfs_super_nodesize(sb));  		ret = -EINVAL;  	} -	if (btrfs_super_sectorsize(sb) < 4096) { -		printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", -				btrfs_super_sectorsize(sb)); +	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { +		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", +				btrfs_super_log_root(sb));  		ret = -EINVAL;  	} diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60cc1399c64f..e2287c7c10be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4139,8 +4139,10 @@ commit_trans:  		    !atomic_read(&root->fs_info->open_ioctl_trans)) {  			need_commit--; -			if (need_commit > 0) +			if (need_commit > 0) { +				btrfs_start_delalloc_roots(fs_info, 0, -1);  				btrfs_wait_ordered_roots(fs_info, -1); +			}  			trans = btrfs_join_transaction(root);  			if (IS_ERR(trans)) @@ -4153,11 +4155,12 @@ commit_trans:  				if (ret)  					return ret;  				/* -				 * make sure that all running delayed iput are -				 * done +				 * The cleaner kthread might still be doing iput +				 * operations. Wait for it to finish so that +				 * more space is released.  				 */ -				down_write(&root->fs_info->delayed_iput_sem); -				up_write(&root->fs_info->delayed_iput_sem); +				mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); +				mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);  				goto again;  			} else {  				btrfs_end_transaction(trans, root); @@ -10399,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,  	 * more device items and remove one chunk item), but this is done at  	 * btrfs_remove_chunk() through a call to check_system_chunk().  	 
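The superblock checks consolidated into btrfs_check_super_valid() above boil down to power-of-two and range tests on sectorsize and nodesize. A minimal user-space sketch of the same logic, with stand-in constants (the SKETCH_* names are illustrative, not kernel symbols):

/* Compilable sketch of the geometry checks; not the kernel implementation. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096ULL                /* stand-in for PAGE_CACHE_SIZE */
#define SKETCH_MAX_METADATA_BLOCKSIZE 65536ULL  /* BTRFS_MAX_METADATA_BLOCKSIZE */

static bool is_power_of_2(uint64_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static int check_geometry(uint64_t sectorsize, uint64_t nodesize)
{
	/* sectorsize must be a power of two in [4K, 64K]... */
	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
	    sectorsize > SKETCH_MAX_METADATA_BLOCKSIZE)
		return -1;
	/* ...and, for now, equal to the page size */
	if (sectorsize != SKETCH_PAGE_SIZE)
		return -1;
	/* nodesize must be a power of two in [sectorsize, 64K] */
	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
	    nodesize > SKETCH_MAX_METADATA_BLOCKSIZE)
		return -1;
	return 0;
}

int main(void)
{
	printf("4K/16K: %d\n", check_geometry(4096, 16384)); /* 0: valid */
	printf("4K/3K:  %d\n", check_geometry(4096, 3072));  /* -1: not pow2 */
	return 0;
}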
*/ -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	num_items = 3 + map->num_stripes;  	free_extent_map(em); @@ -10586,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)  	disk_super = fs_info->super_copy;  	if (!btrfs_super_root(disk_super)) -		return 1; +		return -EINVAL;  	features = btrfs_super_incompat_flags(disk_super);  	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) @@ -10816,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root)  	}  	return 1;  } + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ +	schedule(); +	return 0; +} + +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) +{ +	while (true) { +		int ret; + +		ret = btrfs_start_write_no_snapshoting(root); +		if (ret) +			break; +		wait_on_atomic_t(&root->will_be_snapshoted, +				 wait_snapshoting_atomic_t, +				 TASK_UNINTERRUPTIBLE); +	} +} diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8f33..84fb56d5c018 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em)  		WARN_ON(extent_map_in_tree(em));  		WARN_ON(!list_empty(&em->list));  		if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) -			kfree(em->bdev); +			kfree(em->map_lookup);  		kmem_cache_free(extent_map_cache, em);  	}  } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd8583e..eb8b8fae036b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map {  	u64 block_len;  	u64 generation;  	unsigned long flags; -	struct block_device *bdev; +	union { +		struct block_device *bdev; + +		/* +		 * used for chunk mappings +		 * flags & EXTENT_FLAG_FS_MAPPING must be set +		 */ +		struct map_lookup *map_lookup; +	};  	atomic_t refs;  	unsigned int compress_type;  	struct list_head list; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 83d7859d7619..9f5cc1e8e126 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)  /* simple helper to fault in pages and copy.  This should go away   * and be replaced with calls into generic code.   
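The extent_map change above replaces the long-standing (struct block_device *) cast with an anonymous union, so chunk mappings store a typed map_lookup pointer guarded by EXTENT_FLAG_FS_MAPPING. A reduced, compilable sketch of the pattern (all types here are stand-ins):

#include <stdio.h>

struct block_device;                    /* opaque here */
struct map_lookup { int num_stripes; };

#define EXTENT_FLAG_FS_MAPPING (1UL << 0)

struct extent_map {
	unsigned long flags;
	union {
		struct block_device *bdev;     /* regular extents */
		struct map_lookup *map_lookup; /* chunk mappings only */
	};
};

int main(void)
{
	struct map_lookup map = { .num_stripes = 3 };
	struct extent_map em = { .flags = EXTENT_FLAG_FS_MAPPING,
				 .map_lookup = &map };

	/* Callers must check the flag before touching the union member. */
	if (em.flags & EXTENT_FLAG_FS_MAPPING)
		printf("stripes: %d\n", em.map_lookup->num_stripes);
	return 0;
}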
*/ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, -					 size_t write_bytes, +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,  					 struct page **prepared_pages,  					 struct iov_iter *i)  { @@ -1588,8 +1587,7 @@ again:  			ret = 0;  		} -		copied = btrfs_copy_from_user(pos, num_pages, -					   write_bytes, pages, i); +		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);  		/*  		 * if we have trouble faulting in the pages, fall diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 8b57c17b3fb3..e50316c4af15 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out:  	return ret;  } -static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)  {  	struct btrfs_path *path;  	int ret; @@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)  	int ret;  	mutex_lock(&root->objectid_mutex); -	if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { -		ret = btrfs_find_highest_objectid(root, -						  &root->highest_objectid); -		if (ret) -			goto out; -	} -  	if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {  		ret = -ENOSPC;  		goto out; diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index ddb347bfee23..c8e864b2d530 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root,  			 struct btrfs_trans_handle *trans);  int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);  #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 247830107686..1b79dc9b12e4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3134,7 +3134,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  {  	struct btrfs_fs_info *fs_info = root->fs_info; -	down_read(&fs_info->delayed_iput_sem);  	spin_lock(&fs_info->delayed_iput_lock);  	while (!list_empty(&fs_info->delayed_iputs)) {  		struct btrfs_inode *inode; @@ -3153,7 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  		spin_lock(&fs_info->delayed_iput_lock);  	}  	spin_unlock(&fs_info->delayed_iput_lock); -	up_read(&root->fs_info->delayed_iput_sem);  }  /* @@ -4874,26 +4872,6 @@ next:  	return err;  } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ -	schedule(); -	return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ -	while (true) { -		int ret; - -		ret = btrfs_start_write_no_snapshoting(root); -		if (ret) -			break; -		wait_on_atomic_t(&root->will_be_snapshoted, -				 wait_snapshoting_atomic_t, -				 TASK_UNINTERRUPTIBLE); -	} -} -  static int btrfs_setsize(struct inode *inode, struct iattr *attr)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4925,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)  		 * truncation, it must capture all writes that happened before  		 * this truncation.  		 
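btrfs_wait_for_snapshot_creation(), moved from inode.c to extent-tree.c in this series, retries until the no-snapshot write lock can be taken. A user-space analogue of the retry shape, with the kernel's wait_on_atomic_t sleep replaced by a stub yield (all names are stand-ins):

#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int will_be_snapshoted;

/* stand-in for btrfs_start_write_no_snapshoting(): fails while a
 * snapshot is pending */
static int start_write_no_snapshoting(void)
{
	return atomic_load(&will_be_snapshoted) == 0;
}

static void wait_for_snapshot_creation(void)
{
	while (!start_write_no_snapshoting())
		sched_yield(); /* the kernel sleeps on the atomic instead */
}

int main(void)
{
	wait_for_snapshot_creation();
	printf("write allowed\n");
	return 0;
}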
*/ -		wait_for_snapshot_creation(root); +		btrfs_wait_for_snapshot_creation(root);  		ret = btrfs_cont_expand(inode, oldsize, newsize);  		if (ret) {  			btrfs_end_write_no_snapshoting(root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a47a3148ec8..9028737ee9b5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir,  		goto fail;  	} +	mutex_lock(&new_root->objectid_mutex); +	new_root->highest_objectid = new_dirid; +	mutex_unlock(&new_root->objectid_mutex); +  	/*  	 * insert the directory item  	 */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6d707545f775..55161369fab1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -609,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,  	return 1;  } +static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, +				  int index) +{ +	return stripe * rbio->stripe_npages + index; +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, +				     int index) +{ +	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; +} +  /*   * helper to index into the pstripe   */  static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)  { -	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; -	return rbio->stripe_pages[index]; +	return rbio_stripe_page(rbio, rbio->nr_data, index);  }  /* @@ -626,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)  {  	if (rbio->nr_data + 1 == rbio->real_stripes)  		return NULL; - -	index += ((rbio->nr_data + 1) * rbio->stripe_len) >> -		PAGE_CACHE_SHIFT; -	return rbio->stripe_pages[index]; +	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);  }  /* @@ -889,6 +901,7 @@ static void raid_write_end_io(struct bio *bio)  {  	struct btrfs_raid_bio *rbio = bio->bi_private;  	int err = bio->bi_error; +	int max_errors;  	if (err)  		fail_bio_stripe(rbio, bio); @@ -901,7 +914,9 @@ static void raid_write_end_io(struct bio *bio)  	err = 0;  	/* OK, we have read all the stripes we need to. */ -	if (atomic_read(&rbio->error) > rbio->bbio->max_errors) +	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
+		     0 : rbio->bbio->max_errors; +	if (atomic_read(&rbio->error) > max_errors)  		err = -EIO;  	rbio_orig_end_io(rbio, err); @@ -947,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,   */  static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)  { -	unsigned long nr = stripe_len * nr_stripes; -	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); +	return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes;  }  /* @@ -966,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,  	void *p;  	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + -		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), -			GFP_NOFS); +		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * +		       sizeof(long), GFP_NOFS);  	if (!rbio)  		return ERR_PTR(-ENOMEM); @@ -1021,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)  		if (!page)  			return -ENOMEM;  		rbio->stripe_pages[i] = page; -		ClearPageUptodate(page);  	}  	return 0;  } -/* allocate pages for just the p/q stripes */ +/* only allocate pages for p/q stripes */  static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)  {  	int i;  	struct page *page; -	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; +	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);  	for (; i < rbio->nr_pages; i++) {  		if (rbio->stripe_pages[i]) @@ -1121,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)  }  /* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) -{ -	int index; -	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); -	index += page; -	return rbio->stripe_pages[index]; -} - -/*   * helper function to walk our bio list and populate the bio_pages array with   * the result.  This seems expensive, but it is faster than constantly   * searching through the bio list as we setup the IO in finish_rmw or stripe @@ -1175,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)  {  	struct btrfs_bio *bbio = rbio->bbio;  	void *pointers[rbio->real_stripes]; -	int stripe_len = rbio->stripe_len;  	int nr_data = rbio->nr_data;  	int stripe;  	int pagenr; @@ -1183,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)  	int q_stripe = -1;  	struct bio_list bio_list;  	struct bio *bio; -	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;  	int ret;  	bio_list_init(&bio_list); @@ -1226,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)  	else  		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); -	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { +	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {  		struct page *p;  		/* first collect one page from each data stripe */  		for (stripe = 0; stripe < nr_data; stripe++) { @@ -1268,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)  	 * everything else.  	 
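The raid56 cleanup above replaces the byte-based shifts with one indexing helper: page (stripe, index) lives at stripe * stripe_npages + index, and rbio_nr_pages() now rounds up per stripe before multiplying. A worked example with a 64K stripe and 4K pages (values illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static int rbio_stripe_page_index(int stripe_npages, int stripe, int index)
{
	return stripe * stripe_npages + index;
}

int main(void)
{
	unsigned long stripe_len = 64 * 1024;
	int nr_stripes = 4;  /* e.g. 3 data + 1 parity */
	int stripe_npages = DIV_ROUND_UP(stripe_len, 1UL << PAGE_SHIFT);

	/* new rbio_nr_pages(): pages per stripe, then times stripes */
	printf("nr_pages = %d\n", stripe_npages * nr_stripes);   /* 64 */
	/* first page of the parity stripe (nr_data == 3) */
	printf("p-stripe page 0 = index %d\n",
	       rbio_stripe_page_index(stripe_npages, 3, 0));     /* 48 */
	return 0;
}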
*/  	for (stripe = 0; stripe < rbio->real_stripes; stripe++) { -		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { +		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {  			struct page *page;  			if (stripe < rbio->nr_data) {  				page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1292,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)  		if (!bbio->tgtdev_map[stripe])  			continue; -		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { +		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {  			struct page *page;  			if (stripe < rbio->nr_data) {  				page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1506,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)  	int bios_to_read = 0;  	struct bio_list bio_list;  	int ret; -	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);  	int pagenr;  	int stripe;  	struct bio *bio; @@ -1525,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)  	 * stripe  	 */  	for (stripe = 0; stripe < rbio->nr_data; stripe++) { -		for (pagenr = 0; pagenr < nr_pages; pagenr++) { +		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {  			struct page *page;  			/*  			 * we want to find all the pages missing from @@ -1801,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)  	int pagenr, stripe;  	void **pointers;  	int faila = -1, failb = -1; -	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);  	struct page *page;  	int err;  	int i; @@ -1824,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)  	index_rbio_pages(rbio); -	for (pagenr = 0; pagenr < nr_pages; pagenr++) { +	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {  		/*  		 * Now we just use bitmap to mark the horizontal stripes in  		 * which we have data when doing parity scrub. @@ -1935,7 +1932,7 @@ pstripe:  		 * other endio functions will fiddle the uptodate bits  		 */  		if (rbio->operation == BTRFS_RBIO_WRITE) { -			for (i = 0;  i < nr_pages; i++) { +			for (i = 0;  i < rbio->stripe_npages; i++) {  				if (faila != -1) {  					page = rbio_stripe_page(rbio, faila, i);  					SetPageUptodate(page); @@ -2031,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)  	int bios_to_read = 0;  	struct bio_list bio_list;  	int ret; -	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);  	int pagenr;  	int stripe;  	struct bio *bio; @@ -2055,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)  			continue;  		} -		for (pagenr = 0; pagenr < nr_pages; pagenr++) { +		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {  			struct page *p;  			/* @@ -2279,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)  			if (!page)  				return -ENOMEM;  			rbio->stripe_pages[index] = page; -			ClearPageUptodate(page);  		}  	}  	return 0;  } -/* - * end io function used by finish_rmw.  
When we finally - * get here, we've written a full stripe - */ -static void raid_write_parity_end_io(struct bio *bio) -{ -	struct btrfs_raid_bio *rbio = bio->bi_private; -	int err = bio->bi_error; - -	if (bio->bi_error) -		fail_bio_stripe(rbio, bio); - -	bio_put(bio); - -	if (!atomic_dec_and_test(&rbio->stripes_pending)) -		return; - -	err = 0; - -	if (atomic_read(&rbio->error)) -		err = -EIO; - -	rbio_orig_end_io(rbio, err); -} -  static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,  					 int need_check)  { @@ -2462,7 +2432,7 @@ submit_write:  			break;  		bio->bi_private = rbio; -		bio->bi_end_io = raid_write_parity_end_io; +		bio->bi_end_io = raid_write_end_io;  		submit_bio(WRITE, bio);  	}  	return; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0c981ebe2acb..b1a68530e911 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2813,7 +2813,7 @@ out:  static inline int scrub_calc_parity_bitmap_len(int nsectors)  { -	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); +	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);  }  static void scrub_parity_get(struct scrub_parity *sparity) @@ -3458,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,  		return ret;  	} -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	if (em->start != chunk_offset)  		goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b9eab6d048e..d41e09fe8e38 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -383,6 +383,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  	int ret = 0;  	char *compress_type;  	bool compress_force = false; +	enum btrfs_compression_type saved_compress_type; +	bool saved_compress_force; +	int no_compress = 0;  	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);  	if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) @@ -462,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  			/* Fallthrough */  		case Opt_compress:  		case Opt_compress_type: +			saved_compress_type = btrfs_test_opt(root, COMPRESS) ? 
+				info->compress_type : BTRFS_COMPRESS_NONE; +			saved_compress_force = +				btrfs_test_opt(root, FORCE_COMPRESS);  			if (token == Opt_compress ||  			    token == Opt_compress_force ||  			    strcmp(args[0].from, "zlib") == 0) { @@ -470,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  				btrfs_set_opt(info->mount_opt, COMPRESS);  				btrfs_clear_opt(info->mount_opt, NODATACOW);  				btrfs_clear_opt(info->mount_opt, NODATASUM); +				no_compress = 0;  			} else if (strcmp(args[0].from, "lzo") == 0) {  				compress_type = "lzo";  				info->compress_type = BTRFS_COMPRESS_LZO; @@ -477,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  				btrfs_clear_opt(info->mount_opt, NODATACOW);  				btrfs_clear_opt(info->mount_opt, NODATASUM);  				btrfs_set_fs_incompat(info, COMPRESS_LZO); +				no_compress = 0;  			} else if (strncmp(args[0].from, "no", 2) == 0) {  				compress_type = "no";  				btrfs_clear_opt(info->mount_opt, COMPRESS);  				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);  				compress_force = false; +				no_compress++;  			} else {  				ret = -EINVAL;  				goto out;  			}  			if (compress_force) { -				btrfs_set_and_info(root, FORCE_COMPRESS, -						   "force %s compression", -						   compress_type); +				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);  			} else { -				if (!btrfs_test_opt(root, COMPRESS)) -					btrfs_info(root->fs_info, -						   "btrfs: use %s compression", -						   compress_type);  				/*  				 * If we remount from compress-force=xxx to  				 * compress=xxx, we need clear FORCE_COMPRESS @@ -504,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)  				 */  				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);  			} +			if ((btrfs_test_opt(root, COMPRESS) && +			     (info->compress_type != saved_compress_type || +			      compress_force != saved_compress_force)) || +			    (!btrfs_test_opt(root, COMPRESS) && +			     no_compress == 1)) { +				btrfs_info(root->fs_info, +					   "%s %s compression", +					   (compress_force) ? 
"force" : "use", +					   compress_type); +			} +			compress_force = false;  			break;  		case Opt_ssd:  			btrfs_set_and_info(root, SSD, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c32abbca9d77..366b335946fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  	},  }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {  	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,  	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,  	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP, @@ -233,6 +233,7 @@ static struct btrfs_device *__alloc_device(void)  	spin_lock_init(&dev->reada_lock);  	atomic_set(&dev->reada_in_flight, 0);  	atomic_set(&dev->dev_stats_ccnt, 0); +	btrfs_device_data_ordered_init(dev);  	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);  	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); @@ -1183,7 +1184,7 @@ again:  		struct map_lookup *map;  		int i; -		map = (struct map_lookup *)em->bdev; +		map = em->map_lookup;  		for (i = 0; i < map->num_stripes; i++) {  			u64 end; @@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,  			free_extent_map(em);  		return -EINVAL;  	} -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	lock_chunks(root->fs_info->chunk_root);  	check_system_chunk(trans, extent_root, map->type);  	unlock_chunks(root->fs_info->chunk_root); @@ -3751,7 +3752,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <  		btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {  		btrfs_warn(fs_info, -	"metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", +	"metadata profile 0x%llx has lower redundancy than data profile 0x%llx",  			bctl->meta.target, bctl->data.target);  	} @@ -4718,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		goto error;  	}  	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); -	em->bdev = (struct block_device *)map; +	em->map_lookup = map;  	em->start = start;  	em->len = num_bytes;  	em->block_start = 0; @@ -4813,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,  		return -EINVAL;  	} -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	item_size = btrfs_chunk_item_size(map->num_stripes);  	stripe_size = em->orig_block_len; @@ -4968,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)  	if (!em)  		return 1; -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	for (i = 0; i < map->num_stripes; i++) {  		if (map->stripes[i].dev->missing) {  			miss_ndevs++; @@ -5048,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  		return 1;  	} -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))  		ret = map->num_stripes;  	else if (map->type & BTRFS_BLOCK_GROUP_RAID10) @@ -5084,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,  	BUG_ON(!em);  	BUG_ON(em->start > logical || em->start + em->len < logical); -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)  		len = map->stripe_len * nr_data_stripes(map);  	free_extent_map(em); @@ -5105,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,  	
BUG_ON(!em);  	BUG_ON(em->start > logical || em->start + em->len < logical); -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)  		ret = 1;  	free_extent_map(em); @@ -5264,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		return -EINVAL;  	} -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	offset = logical - em->start;  	stripe_len = map->stripe_len; @@ -5378,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		 * target drive.  		 */  		for (i = 0; i < tmp_num_stripes; i++) { -			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { -				/* -				 * In case of DUP, in order to keep it -				 * simple, only add the mirror with the -				 * lowest physical address -				 */ -				if (found && -				    physical_of_found <= -				     tmp_bbio->stripes[i].physical) -					continue; -				index_srcdev = i; -				found = 1; -				physical_of_found = -					tmp_bbio->stripes[i].physical; -			} +			if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) +				continue; + +			/* +			 * In case of DUP, in order to keep it simple, only add +			 * the mirror with the lowest physical address +			 */ +			if (found && +			    physical_of_found <= tmp_bbio->stripes[i].physical) +				continue; + +			index_srcdev = i; +			found = 1; +			physical_of_found = tmp_bbio->stripes[i].physical;  		} -		if (found) { -			mirror_num = index_srcdev + 1; -			patch_the_first_stripe_for_dev_replace = 1; -			physical_to_patch_in_first_stripe = physical_of_found; -		} else { +		btrfs_put_bbio(tmp_bbio); + +		if (!found) {  			WARN_ON(1);  			ret = -EIO; -			btrfs_put_bbio(tmp_bbio);  			goto out;  		} -		btrfs_put_bbio(tmp_bbio); +		mirror_num = index_srcdev + 1; +		patch_the_first_stripe_for_dev_replace = 1; +		physical_to_patch_in_first_stripe = physical_of_found;  	} else if (mirror_num > map->num_stripes) {  		mirror_num = 0;  	} @@ -5806,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  		free_extent_map(em);  		return -EIO;  	} -	map = (struct map_lookup *)em->bdev; +	map = em->map_lookup;  	length = em->len;  	rmap_len = map->stripe_len; @@ -6069,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	bbio->fs_info = root->fs_info;  	atomic_set(&bbio->stripes_pending, bbio->num_stripes); -	if (bbio->raid_map) { +	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && +	    ((rw & WRITE) || (mirror_num > 1))) {  		/* In this case, map_length has been set to the length of  		   a single stripe; not the whole write */  		if (rw & WRITE) { @@ -6210,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	struct extent_map *em;  	u64 logical;  	u64 length; +	u64 stripe_len;  	u64 devid;  	u8 uuid[BTRFS_UUID_SIZE];  	int num_stripes; @@ -6218,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	logical = key->offset;  	length = btrfs_chunk_length(leaf, chunk); +	stripe_len = btrfs_chunk_stripe_len(leaf, chunk); +	num_stripes = btrfs_chunk_num_stripes(leaf, chunk); +	/* Validation check */ +	if (!num_stripes) { +		btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", +			  num_stripes); +		return -EIO; +	} +	if (!IS_ALIGNED(logical, root->sectorsize)) { +		btrfs_err(root->fs_info, +			  "invalid chunk logical %llu", logical); +		return -EIO; +	} +	if (!length || !IS_ALIGNED(length, root->sectorsize)) { +		btrfs_err(root->fs_info, +			"invalid chunk length %llu", length); +		
return -EIO; +	} +	if (!is_power_of_2(stripe_len)) { +		btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", +			  stripe_len); +		return -EIO; +	} +	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & +	    btrfs_chunk_type(leaf, chunk)) { +		btrfs_err(root->fs_info, "unrecognized chunk type: %llu", +			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK | +			    BTRFS_BLOCK_GROUP_PROFILE_MASK) & +			  btrfs_chunk_type(leaf, chunk)); +		return -EIO; +	}  	read_lock(&map_tree->map_tree.lock);  	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6234,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	em = alloc_extent_map();  	if (!em)  		return -ENOMEM; -	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);  	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);  	if (!map) {  		free_extent_map(em); @@ -6242,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	}  	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); -	em->bdev = (struct block_device *)map; +	em->map_lookup = map;  	em->start = logical;  	em->len = length;  	em->orig_start = 0; @@ -6944,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,  	/* In order to kick the device replace finish process */  	lock_chunks(root);  	list_for_each_entry(em, &transaction->pending_chunks, list) { -		map = (struct map_lookup *)em->bdev; +		map = em->map_lookup;  		for (i = 0; i < map->num_stripes; i++) {  			dev = map->stripes[i].dev; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1925d6d222b8..58c2f4a21b7f 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -516,7 +516,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,  		return -EINVAL;  	kbuf = memdup_user_nul(buf, count); -	if (!IS_ERR(kbuf)) +	if (IS_ERR(kbuf))  		return PTR_ERR(kbuf);  	if (check_version(kbuf)) { diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1a0835073663..c8021208a7eb 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page)  				EXT4_DECRYPT, page->index, page, page);  } -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, +			   ext4_fsblk_t pblk, ext4_lblk_t len)  {  	struct ext4_crypto_ctx	*ctx;  	struct page		*ciphertext_page = NULL;  	struct bio		*bio; -	ext4_lblk_t		lblk = le32_to_cpu(ex->ee_block); -	ext4_fsblk_t		pblk = ext4_ext_pblock(ex); -	unsigned int		len = ext4_ext_get_actual_len(ex);  	int			ret, err = 0;  #if 0 diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c5882b36e558..9a16d1e75a49 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -213,9 +213,11 @@ retry:  		res = -ENOKEY;  		goto out;  	} +	down_read(&keyring_key->sem);  	ukp = user_key_payload(keyring_key);  	if (ukp->datalen != sizeof(struct ext4_encryption_key)) {  		res = -EINVAL; +		up_read(&keyring_key->sem);  		goto out;  	}  	master_key = (struct ext4_encryption_key *)ukp->data; @@ -226,10 +228,12 @@ retry:  			    "ext4: key size incorrect: %d\n",  			    master_key->size);  		res = -ENOKEY; +		up_read(&keyring_key->sem);  		goto out;  	}  	res = ext4_derive_key_aes(ctx.nonce, master_key->raw,  				  raw_key); +	up_read(&keyring_key->sem);  	if (res)  		goto out;  got_key: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7ca4e87144..1c127213363a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -378,14 +378,22 @@ struct flex_groups {  #define EXT4_PROJINHERIT_FL		
0x20000000 /* Create with parents projid */  #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE		0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE		0x204380FF /* User modifiable flags */ + +#define EXT4_FL_XFLAG_VISIBLE		(EXT4_SYNC_FL | \ +					 EXT4_IMMUTABLE_FL | \ +					 EXT4_APPEND_FL | \ +					 EXT4_NODUMP_FL | \ +					 EXT4_NOATIME_FL | \ +					 EXT4_PROJINHERIT_FL)  /* Flags that should be inherited by new inodes from their parent. */  #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\  			   EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\  			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ -			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) +			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ +			   EXT4_PROJINHERIT_FL)  /* Flags that are appropriate for regular files (all but dir-specific ones). */  #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) @@ -555,10 +563,12 @@ enum {  #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040  	/* Request will not result in inode size update (user for fallocate) */  #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080 -	/* Do not take i_data_sem locking in ext4_map_blocks */ -#define EXT4_GET_BLOCKS_NO_LOCK			0x0100  	/* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0200 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0100 +	/* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO			0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO		(EXT4_GET_BLOCKS_CREATE |\ +					EXT4_GET_BLOCKS_ZERO)  /*   * The bit position of these flags must not overlap with any of the @@ -616,6 +626,46 @@ enum {  #define EXT4_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])  #define EXT4_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct ext4_encryption_policy) +#ifndef FS_IOC_FSGETXATTR +/* Until the uapi changes get merged for project quota... */ + +#define FS_IOC_FSGETXATTR		_IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr) + +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. 
+ */ +struct fsxattr { +	__u32		fsx_xflags;	/* xflags field value (get/set) */ +	__u32		fsx_extsize;	/* extsize field value (get/set)*/ +	__u32		fsx_nextents;	/* nextents field value (get)	*/ +	__u32		fsx_projid;	/* project identifier (get/set) */ +	unsigned char	fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME	0x00000001	/* data in realtime volume */ +#define FS_XFLAG_PREALLOC	0x00000002	/* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE	0x00000008	/* file cannot be modified */ +#define FS_XFLAG_APPEND		0x00000010	/* all writes append */ +#define FS_XFLAG_SYNC		0x00000020	/* all writes synchronous */ +#define FS_XFLAG_NOATIME	0x00000040	/* do not update access time */ +#define FS_XFLAG_NODUMP		0x00000080	/* do not include in backups */ +#define FS_XFLAG_RTINHERIT	0x00000100	/* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT	0x00000200	/* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG	0x00002000  	/* do not defragment */ +#define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */ +#define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this */ +#endif /* !defined(FS_IOC_FSGETXATTR) */ + +#define EXT4_IOC_FSGETXATTR		FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR		FS_IOC_FSSETXATTR +  #if defined(__KERNEL__) && defined(CONFIG_COMPAT)  /*   * ioctl commands in 32 bit emulation @@ -910,6 +960,15 @@ struct ext4_inode_info {  	 * by other means, so we have i_data_sem.  	 */  	struct rw_semaphore i_data_sem; +	/* +	 * i_mmap_sem is for serializing page faults with truncate / punch hole +	 * operations. We have to make sure that new page cannot be faulted in +	 * a section of the inode that is being punched. We cannot easily use +	 * i_data_sem for this since we need protection for the whole punch +	 * operation and i_data_sem ranks below transaction start so we have +	 * to occasionally drop it. 
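The fsxattr interface defined above is the same one XFS exposes; from user space it is reached via ioctl(). A hedged usage sketch, assuming a kernel and libc where linux/fs.h carries the definitions (older systems define them in XFS headers instead):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>  /* struct fsxattr, FS_IOC_FSGETXATTR on newer kernels */

int main(int argc, char **argv)
{
	struct fsxattr fsx;
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0) {
		perror("FS_IOC_FSGETXATTR");
		return 1;
	}
	printf("xflags=0x%x projid=%u\n", fsx.fsx_xflags, fsx.fsx_projid);
	close(fd);
	return 0;
}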
+	 */ +	struct rw_semaphore i_mmap_sem;  	struct inode vfs_inode;  	struct jbd2_inode *jinode; @@ -993,6 +1052,7 @@ struct ext4_inode_info {  	/* Encryption params */  	struct ext4_crypt_info *i_crypt_info;  #endif +	kprojid_t i_projid;  };  /* @@ -1248,7 +1308,7 @@ struct ext4_super_block {  #endif  /* Number of quota types we support */ -#define EXT4_MAXQUOTAS 2 +#define EXT4_MAXQUOTAS 3  /*   * fourth extended-fs super-block data in memory @@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)  					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\  					 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\  					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ -					 EXT4_FEATURE_RO_COMPAT_QUOTA) +					 EXT4_FEATURE_RO_COMPAT_QUOTA |\ +					 EXT4_FEATURE_RO_COMPAT_PROJECT)  #define EXTN_FEATURE_FUNCS(ver) \  static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)  #define	EXT4_DEF_RESUID		0  #define	EXT4_DEF_RESGID		0 +/* + * Default project ID + */ +#define	EXT4_DEF_PROJID		0 +  #define EXT4_DEF_INODE_READAHEAD_BLKS	32  /* @@ -2234,7 +2300,8 @@ void ext4_restore_control_page(struct page *data_page);  struct page *ext4_encrypt(struct inode *inode,  			  struct page *plaintext_page);  int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, +			   ext4_fsblk_t pblk, ext4_lblk_t len);  #ifdef CONFIG_EXT4_FS_ENCRYPTION  int ext4_init_crypto(void); @@ -2440,8 +2507,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);  struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);  int ext4_get_block_write(struct inode *inode, sector_t iblock,  			 struct buffer_head *bh_result, int create); -int ext4_get_block_dax(struct inode *inode, sector_t iblock, -			 struct buffer_head *bh_result, int create); +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, +			    struct buffer_head *bh_result, int create);  int ext4_get_block(struct inode *inode, sector_t iblock,  				struct buffer_head *bh_result, int create);  int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, @@ -2484,9 +2551,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,  			     loff_t lstart, loff_t lend);  extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);  extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);  extern void ext4_da_update_reserve_space(struct inode *inode,  					int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, +			      ext4_fsblk_t pblk, ext4_lblk_t len);  /* indirect.c */  extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2848,6 +2919,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)  	return changed;  } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, +				      loff_t len); +  struct ext4_group_info {  	unsigned long   bb_state;  	struct rb_root  bb_free_root; @@ -2986,8 +3060,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,  					 struct page *page);  extern int 
ext4_try_add_inline_entry(handle_t *handle,  				     struct ext4_filename *fname, -				     struct dentry *dentry, -				     struct inode *inode); +				     struct inode *dir, struct inode *inode);  extern int ext4_try_create_inline_dir(handle_t *handle,  				      struct inode *parent,  				      struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 551353b1b17a..b52fea3b7219 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)  {  	ext4_fsblk_t ee_pblock;  	unsigned int ee_len; -	int ret;  	ee_len    = ext4_ext_get_actual_len(ex);  	ee_pblock = ext4_ext_pblock(ex); - -	if (ext4_encrypted_inode(inode)) -		return ext4_encrypted_zeroout(inode, ex); - -	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); -	if (ret > 0) -		ret = 0; - -	return ret; +	return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, +				  ee_len);  }  /* @@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,  	}  	/* IO end_io complete, convert the filled extent to written */  	if (flags & EXT4_GET_BLOCKS_CONVERT) { +		if (flags & EXT4_GET_BLOCKS_ZERO) { +			if (allocated > map->m_len) +				allocated = map->m_len; +			err = ext4_issue_zeroout(inode, map->m_lblk, newblock, +						 allocated); +			if (err < 0) +				goto out2; +		}  		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,  							   ppath);  		if (ret >= 0) { @@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,  	if (len <= EXT_UNWRITTEN_MAX_LEN)  		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -	/* Wait all existing dio workers, newcomers will block on i_mutex */ -	ext4_inode_block_unlocked_dio(inode); -	inode_dio_wait(inode); -  	/*  	 * credits to insert 1 extent into extent tree  	 */ @@ -4752,8 +4748,6 @@ retry:  		goto retry;  	} -	ext4_inode_resume_unlocked_dio(inode); -  	return ret > 0 ? ret2 : ret;  } @@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,  	int partial_begin, partial_end;  	loff_t start, end;  	ext4_lblk_t lblk; -	struct address_space *mapping = inode->i_mapping;  	unsigned int blkbits = inode->i_blkbits;  	trace_ext4_zero_range(inode, offset, len, mode); @@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,  	}  	/* -	 * Write out all dirty pages to avoid race conditions -	 * Then release them. -	 */ -	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { -		ret = filemap_write_and_wait_range(mapping, offset, -						   offset + len - 1); -		if (ret) -			return ret; -	} - -	/*  	 * Round up offset. 
This is not fallocate, we neet to zero out  	 * blocks, so convert interior block aligned part of the range to  	 * unwritten and possibly manually zero out unaligned parts of the @@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,  	if (mode & FALLOC_FL_KEEP_SIZE)  		flags |= EXT4_GET_BLOCKS_KEEP_SIZE; +	/* Wait all existing dio workers, newcomers will block on i_mutex */ +	ext4_inode_block_unlocked_dio(inode); +	inode_dio_wait(inode); +  	/* Preallocate the range including the unaligned edges */  	if (partial_begin || partial_end) {  		ret = ext4_alloc_file_blocks(file, @@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,  				 round_down(offset, 1 << blkbits)) >> blkbits,  				new_size, flags, mode);  		if (ret) -			goto out_mutex; +			goto out_dio;  	} @@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,  		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |  			  EXT4_EX_NOCACHE); -		/* Now release the pages and zero block aligned part of pages*/ +		/* +		 * Prevent page faults from reinstantiating pages we have +		 * released from page cache. +		 */ +		down_write(&EXT4_I(inode)->i_mmap_sem); +		ret = ext4_update_disksize_before_punch(inode, offset, len); +		if (ret) { +			up_write(&EXT4_I(inode)->i_mmap_sem); +			goto out_dio; +		} +		/* Now release the pages and zero block aligned part of pages */  		truncate_pagecache_range(inode, start, end - 1);  		inode->i_mtime = inode->i_ctime = ext4_current_time(inode); -		/* Wait all existing dio workers, newcomers will block on i_mutex */ -		ext4_inode_block_unlocked_dio(inode); -		inode_dio_wait(inode); -  		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,  					     flags, mode); +		up_write(&EXT4_I(inode)->i_mmap_sem);  		if (ret)  			goto out_dio;  	} @@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  			goto out;  	} +	/* Wait all existing dio workers, newcomers will block on i_mutex */ +	ext4_inode_block_unlocked_dio(inode); +	inode_dio_wait(inode); +  	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,  				     flags, mode); +	ext4_inode_resume_unlocked_dio(inode);  	if (ret)  		goto out; @@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)  			return ret;  	} -	/* -	 * Need to round down offset to be aligned with page size boundary -	 * for page size > block size. -	 */ -	ioffset = round_down(offset, PAGE_SIZE); - -	/* Write out all dirty pages */ -	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, -					   LLONG_MAX); -	if (ret) -		return ret; - -	/* Take mutex lock */  	mutex_lock(&inode->i_mutex); -  	/*  	 * There is no need to overlap collapse range with EOF, in which case  	 * it is effectively a truncate operation @@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)  		goto out_mutex;  	} -	truncate_pagecache(inode, ioffset); -  	/* Wait for existing dio to complete */  	ext4_inode_block_unlocked_dio(inode);  	inode_dio_wait(inode); +	/* +	 * Prevent page faults from reinstantiating pages we have released from +	 * page cache. +	 */ +	down_write(&EXT4_I(inode)->i_mmap_sem); +	/* +	 * Need to round down offset to be aligned with page size boundary +	 * for page size > block size. +	 */ +	ioffset = round_down(offset, PAGE_SIZE); +	/* +	 * Write tail of the last page before removed range since it will get +	 * removed from the page cache below. 
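The zero-range and collapse-range rework here settles on one ordering: take i_mutex, drain direct IO, take i_mmap_sem for write, write back the affected ranges, then drop the page cache. A compilable skeleton of that ordering with stubbed operations (all names below are stand-ins, not ext4 functions):

#include <stdio.h>

static void lock(const char *what)   { printf("lock   %s\n", what); }
static void unlock(const char *what) { printf("unlock %s\n", what); }
static void step(const char *what)   { printf("       %s\n", what); }

int main(void)
{
	lock("i_mutex");                /* serialize against other writers */
	step("block and drain direct IO");
	lock("i_mmap_sem (write)");     /* stop page faults from racing */
	step("write back affected page cache ranges");
	step("truncate page cache");
	step("shift extents / update on-disk state");
	unlock("i_mmap_sem (write)");
	step("resume direct IO");
	unlock("i_mutex");
	return 0;
}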
+	 */ +	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); +	if (ret) +		goto out_mmap; +	/* +	 * Write data that will be shifted to preserve them when discarding +	 * page cache below. We are also protected from pages becoming dirty +	 * by i_mmap_sem. +	 */ +	ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, +					   LLONG_MAX); +	if (ret) +		goto out_mmap; +	truncate_pagecache(inode, ioffset); +  	credits = ext4_writepage_trans_blocks(inode);  	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle); -		goto out_dio; +		goto out_mmap;  	}  	down_write(&EXT4_I(inode)->i_data_sem); @@ -5573,7 +5583,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)  out_stop:  	ext4_journal_stop(handle); -out_dio: +out_mmap: +	up_write(&EXT4_I(inode)->i_mmap_sem);  	ext4_inode_resume_unlocked_dio(inode);  out_mutex:  	mutex_unlock(&inode->i_mutex); @@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)  			return ret;  	} -	/* -	 * Need to round down to align start offset to page size boundary -	 * for page size > block size. -	 */ -	ioffset = round_down(offset, PAGE_SIZE); - -	/* Write out all dirty pages */ -	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, -			LLONG_MAX); -	if (ret) -		return ret; - -	/* Take mutex lock */  	mutex_lock(&inode->i_mutex); -  	/* Currently just for extent based files */  	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {  		ret = -EOPNOTSUPP; @@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)  		goto out_mutex;  	} -	truncate_pagecache(inode, ioffset); -  	/* Wait for existing dio to complete */  	ext4_inode_block_unlocked_dio(inode);  	inode_dio_wait(inode); +	/* +	 * Prevent page faults from reinstantiating pages we have released from +	 * page cache. +	 */ +	down_write(&EXT4_I(inode)->i_mmap_sem); +	/* +	 * Need to round down to align start offset to page size boundary +	 * for page size > block size. +	 */ +	ioffset = round_down(offset, PAGE_SIZE); +	/* Write out all dirty pages */ +	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, +			LLONG_MAX); +	if (ret) +		goto out_mmap; +	truncate_pagecache(inode, ioffset); +  	credits = ext4_writepage_trans_blocks(inode);  	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);  	if (IS_ERR(handle)) {  		ret = PTR_ERR(handle); -		goto out_dio; +		goto out_mmap;  	}  	/* Expand file to avoid data loss if there is error while shifting */ @@ -5741,7 +5753,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)  out_stop:  	ext4_journal_stop(handle); -out_dio: +out_mmap: +	up_write(&EXT4_I(inode)->i_mmap_sem);  	ext4_inode_resume_unlocked_dio(inode);  out_mutex:  	mutex_unlock(&inode->i_mutex); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 113837e7ba98..749b222e6498 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -193,43 +193,35 @@ out:  }  #ifdef CONFIG_FS_DAX -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ -	struct inode *inode = bh->b_assoc_map->host; -	/* XXX: breaks on 32-bit > 16TB. Is that even supported? 
*/ -	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; -	int err; -	if (!uptodate) -		return; -	WARN_ON(!buffer_unwritten(bh)); -	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} -  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	int result;  	handle_t *handle = NULL; -	struct super_block *sb = file_inode(vma->vm_file)->i_sb; +	struct inode *inode = file_inode(vma->vm_file); +	struct super_block *sb = inode->i_sb;  	bool write = vmf->flags & FAULT_FLAG_WRITE;  	if (write) {  		sb_start_pagefault(sb);  		file_update_time(vma->vm_file); +		down_read(&EXT4_I(inode)->i_mmap_sem);  		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,  						EXT4_DATA_TRANS_BLOCKS(sb)); -	} +	} else +		down_read(&EXT4_I(inode)->i_mmap_sem);  	if (IS_ERR(handle))  		result = VM_FAULT_SIGBUS;  	else -		result = __dax_fault(vma, vmf, ext4_get_block_dax, -						ext4_end_io_unwritten); +		result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);  	if (write) {  		if (!IS_ERR(handle))  			ext4_journal_stop(handle); +		up_read(&EXT4_I(inode)->i_mmap_sem);  		sb_end_pagefault(sb); -	} +	} else +		up_read(&EXT4_I(inode)->i_mmap_sem);  	return result;  } @@ -246,44 +238,86 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,  	if (write) {  		sb_start_pagefault(sb);  		file_update_time(vma->vm_file); +		down_read(&EXT4_I(inode)->i_mmap_sem);  		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,  				ext4_chunk_trans_blocks(inode,  							PMD_SIZE / PAGE_SIZE)); -	} +	} else +		down_read(&EXT4_I(inode)->i_mmap_sem);  	if (IS_ERR(handle))  		result = VM_FAULT_SIGBUS;  	else  		result = __dax_pmd_fault(vma, addr, pmd, flags, -				ext4_get_block_dax, ext4_end_io_unwritten); +				ext4_dax_mmap_get_block, NULL);  	if (write) {  		if (!IS_ERR(handle))  			ext4_journal_stop(handle); +		up_read(&EXT4_I(inode)->i_mmap_sem);  		sb_end_pagefault(sb); -	} +	} else +		up_read(&EXT4_I(inode)->i_mmap_sem);  	return result;  }  static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  { -	return dax_mkwrite(vma, vmf, ext4_get_block_dax, -				ext4_end_io_unwritten); +	int err; +	struct inode *inode = file_inode(vma->vm_file); + +	sb_start_pagefault(inode->i_sb); +	file_update_time(vma->vm_file); +	down_read(&EXT4_I(inode)->i_mmap_sem); +	err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); +	up_read(&EXT4_I(inode)->i_mmap_sem); +	sb_end_pagefault(inode->i_sb); + +	return err; +} + +/* + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() + * handler we check for races agaist truncate. Note that since we cycle through + * i_mmap_sem, we are sure that also any hole punching that began before we + * were called is finished by now and so if it included part of the file we + * are working on, our pte will get unmapped and the check for pte_same() in + * wp_pfn_shared() fails. Thus fault gets retried and things work out as + * desired. 
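The new ext4_dax_pfn_mkwrite() described here reduces to a bounds check done under i_mmap_sem: compute the number of pages covering i_size and refuse faults past it. A tiny sketch of the index check (illustrative values):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int fault_ok(unsigned long long i_size, unsigned long pgoff)
{
	/* number of pages covering i_size, rounded up */
	unsigned long long size = (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	return pgoff < size; /* beyond EOF => SIGBUS in the real handler */
}

int main(void)
{
	printf("%d\n", fault_ok(8192, 1)); /* 1: within the file */
	printf("%d\n", fault_ok(8192, 2)); /* 0: page fully past EOF */
	return 0;
}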
+ */ +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, +				struct vm_fault *vmf) +{ +	struct inode *inode = file_inode(vma->vm_file); +	struct super_block *sb = inode->i_sb; +	int ret = VM_FAULT_NOPAGE; +	loff_t size; + +	sb_start_pagefault(sb); +	file_update_time(vma->vm_file); +	down_read(&EXT4_I(inode)->i_mmap_sem); +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; +	if (vmf->pgoff >= size) +		ret = VM_FAULT_SIGBUS; +	up_read(&EXT4_I(inode)->i_mmap_sem); +	sb_end_pagefault(sb); + +	return ret;  }  static const struct vm_operations_struct ext4_dax_vm_ops = {  	.fault		= ext4_dax_fault,  	.pmd_fault	= ext4_dax_pmd_fault,  	.page_mkwrite	= ext4_dax_mkwrite, -	.pfn_mkwrite	= dax_pfn_mkwrite, +	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,  };  #else  #define ext4_dax_vm_ops	ext4_file_vm_ops  #endif  static const struct vm_operations_struct ext4_file_vm_ops = { -	.fault		= filemap_fault, +	.fault		= ext4_filemap_fault,  	.map_pages	= filemap_map_pages,  	.page_mkwrite   = ext4_page_mkwrite,  }; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1b8024d26f65..3fcfd50a2e8a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -799,6 +799,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,  		inode->i_gid = dir->i_gid;  	} else  		inode_init_owner(inode, dir, mode); + +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && +	    ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) +		ei->i_projid = EXT4_I(dir)->i_projid; +	else +		ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); +  	err = dquot_initialize(inode);  	if (err)  		goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..dfe3b9bafc0d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,   */  static int ext4_add_dirent_to_inline(handle_t *handle,  				     struct ext4_filename *fname, -				     struct dentry *dentry, +				     struct inode *dir,  				     struct inode *inode,  				     struct ext4_iloc *iloc,  				     void *inline_start, int inline_size)  { -	struct inode	*dir = d_inode(dentry->d_parent);  	int		err;  	struct ext4_dir_entry_2 *de; @@ -1245,12 +1244,11 @@ out:   * the new created block.   
*/  int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, -			      struct dentry *dentry, struct inode *inode) +			      struct inode *dir, struct inode *inode)  {  	int ret, inline_size;  	void *inline_start;  	struct ext4_iloc iloc; -	struct inode *dir = d_inode(dentry->d_parent);  	ret = ext4_get_inode_loc(dir, &iloc);  	if (ret) @@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,  						 EXT4_INLINE_DOTDOT_SIZE;  	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; -	ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, +	ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,  					inline_start, inline_size);  	if (ret != -ENOSPC)  		goto out; @@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,  	if (inline_size) {  		inline_start = ext4_get_inline_xattr_pos(dir, &iloc); -		ret = ext4_add_dirent_to_inline(handle, fname, dentry, +		ret = ext4_add_dirent_to_inline(handle, fname, dir,  						inode, &iloc, inline_start,  						inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3bd912df6bf..d964195ea0e2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func,  	return 0;  } +int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, +		       ext4_lblk_t len) +{ +	int ret; + +	if (ext4_encrypted_inode(inode)) +		return ext4_encrypted_zeroout(inode, lblk, pblk, len); + +	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); +	if (ret > 0) +		ret = 0; + +	return ret; +} +  #define check_block_validity(inode, map)	\  	__check_block_validity((inode), __func__, __LINE__, (map)) @@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,  	 * out taking i_data_sem.  So at the time the unwritten extent  	 * could be converted.  	 */ -	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) -		down_read(&EXT4_I(inode)->i_data_sem); +	down_read(&EXT4_I(inode)->i_data_sem);  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {  		retval = ext4_ext_map_blocks(handle, inode, map, flags &  					     EXT4_GET_BLOCKS_KEEP_SIZE); @@ -412,8 +426,7 @@  		retval = ext4_ind_map_blocks(handle, inode, map, flags &  					     EXT4_GET_BLOCKS_KEEP_SIZE);  	} -	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) -		up_read((&EXT4_I(inode)->i_data_sem)); +	up_read((&EXT4_I(inode)->i_data_sem));  	/*  	 * We don't check m_len because extent will be collapsed in status @@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  	 * Try to see if we can get the block without requesting a new  	 * file system block.  	 */ -	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) -		down_read(&EXT4_I(inode)->i_data_sem); +	down_read(&EXT4_I(inode)->i_data_sem);  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {  		retval = ext4_ext_map_blocks(handle, inode, map, flags &  					     EXT4_GET_BLOCKS_KEEP_SIZE); @@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,  		if (ret < 0)  			retval = ret;  	} -	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) -		up_read((&EXT4_I(inode)->i_data_sem)); +	up_read((&EXT4_I(inode)->i_data_sem));  found:  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -626,13 +637,29 @@ found:  		}  		/* +		 * We have to zeroout blocks before inserting them into extent +		 * status tree. 
Otherwise someone could look them up there and +		 * use them before they are really zeroed. +		 */ +		if (flags & EXT4_GET_BLOCKS_ZERO && +		    map->m_flags & EXT4_MAP_MAPPED && +		    map->m_flags & EXT4_MAP_NEW) { +			ret = ext4_issue_zeroout(inode, map->m_lblk, +						 map->m_pblk, map->m_len); +			if (ret) { +				retval = ret; +				goto out_sem; +			} +		} + +		/*  		 * If the extent has been zeroed out, we don't need to update  		 * extent status tree.  		 */  		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&  		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {  			if (ext4_es_is_written(&es)) -				goto has_zeroout; +				goto out_sem;  		}  		status = map->m_flags & EXT4_MAP_UNWRITTEN ?  				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -643,11 +670,13 @@ found:  			status |= EXTENT_STATUS_DELAYED;  		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,  					    map->m_pblk, status); -		if (ret < 0) +		if (ret < 0) {  			retval = ret; +			goto out_sem; +		}  	} -has_zeroout: +out_sem:  	up_write((&EXT4_I(inode)->i_data_sem));  	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {  		ret = check_block_validity(inode, map); @@ -674,7 +703,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,  	map.m_lblk = iblock;  	map.m_len = bh->b_size >> inode->i_blkbits; -	if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { +	if (flags && !handle) {  		/* Direct IO write... */  		if (map.m_len > DIO_MAX_BLOCKS)  			map.m_len = DIO_MAX_BLOCKS; @@ -694,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,  		map_bh(bh, inode->i_sb, map.m_pblk);  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; -		if (IS_DAX(inode) && buffer_unwritten(bh)) { -			/* -			 * dgc: I suspect unwritten conversion on ext4+DAX is -			 * fundamentally broken here when there are concurrent -			 * read/write in progress on this inode. -			 */ -			WARN_ON_ONCE(io_end); -			bh->b_assoc_map = inode->i_mapping; -			bh->b_private = (void *)(unsigned long)iblock; -		}  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)  			set_buffer_defer_completion(bh);  		bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -879,9 +898,6 @@ int do_journal_get_write_access(handle_t *handle,  	return ret;  } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, -		   struct buffer_head *bh_result, int create); -  #ifdef CONFIG_EXT4_FS_ENCRYPTION  static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,  				  get_block_t *get_block) @@ -3054,25 +3070,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,  			       EXT4_GET_BLOCKS_IO_CREATE_EXT);  } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, +static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,  		   struct buffer_head *bh_result, int create)  { -	ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", +	int ret; + +	ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",  		   inode->i_ino, create); -	return _ext4_get_block(inode, iblock, bh_result, -			       EXT4_GET_BLOCKS_NO_LOCK); +	ret = _ext4_get_block(inode, iblock, bh_result, 0); +	/* +	 * Blocks should have been preallocated! ext4_file_write_iter() checks +	 * that. 
+	 */ +	WARN_ON_ONCE(!buffer_mapped(bh_result)); + +	return ret;  } -int ext4_get_block_dax(struct inode *inode, sector_t iblock, -		   struct buffer_head *bh_result, int create) +#ifdef CONFIG_FS_DAX +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, +			    struct buffer_head *bh_result, int create)  { -	int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; -	if (create) -		flags |= EXT4_GET_BLOCKS_CREATE; -	ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", +	int ret, err; +	int credits; +	struct ext4_map_blocks map; +	handle_t *handle = NULL; +	int flags = 0; + +	ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",  		   inode->i_ino, create); -	return _ext4_get_block(inode, iblock, bh_result, flags); +	map.m_lblk = iblock; +	map.m_len = bh_result->b_size >> inode->i_blkbits; +	credits = ext4_chunk_trans_blocks(inode, map.m_len); +	if (create) { +		flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; +		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); +		if (IS_ERR(handle)) { +			ret = PTR_ERR(handle); +			return ret; +		} +	} + +	ret = ext4_map_blocks(handle, inode, &map, flags); +	if (create) { +		err = ext4_journal_stop(handle); +		if (ret >= 0 && err < 0) +			ret = err; +	} +	if (ret <= 0) +		goto out; +	if (map.m_flags & EXT4_MAP_UNWRITTEN) { +		int err2; + +		/* +		 * We are protected by i_mmap_sem so we know block cannot go +		 * away from under us even though we dropped i_data_sem. +		 * Convert extent to written and write zeros there. +		 * +		 * Note: We may get here even when create == 0. +		 */ +		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); +		if (IS_ERR(handle)) { +			ret = PTR_ERR(handle); +			goto out; +		} + +		err = ext4_map_blocks(handle, inode, &map, +		      EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); +		if (err < 0) +			ret = err; +		err2 = ext4_journal_stop(handle); +		if (err2 < 0 && ret > 0) +			ret = err2; +	} +out: +	WARN_ON_ONCE(ret == 0 && create); +	if (ret > 0) { +		map_bh(bh_result, inode->i_sb, map.m_pblk); +		bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | +					map.m_flags; +		/* +		 * At least for now we have to clear BH_New so that DAX code +		 * doesn't attempt to zero blocks again in a racy way. +		 */ +		bh_result->b_state &= ~(1 << BH_New); +		bh_result->b_size = map.m_len << inode->i_blkbits; +		ret = 0; +	} +	return ret;  } +#endif  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  			    ssize_t size, void *private) @@ -3143,10 +3230,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,  	/* If we do a overwrite dio, i_mutex locking can be released */  	overwrite = *((int *)iocb->private); -	if (overwrite) { -		down_read(&EXT4_I(inode)->i_data_sem); +	if (overwrite)  		mutex_unlock(&inode->i_mutex); -	}  	/*  	 * We could direct write to holes and fallocate. 
@@ -3189,7 +3274,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,  	}  	if (overwrite) { -		get_block_func = ext4_get_block_write_nolock; +		get_block_func = ext4_get_block_overwrite;  	} else {  		get_block_func = ext4_get_block_write;  		dio_flags = DIO_LOCKING;  	} @@ -3245,10 +3330,8 @@ retake_lock:  	if (iov_iter_rw(iter) == WRITE)  		inode_dio_end(inode);  	/* take i_mutex locking again if we do a overwrite dio */ -	if (overwrite) { -		up_read(&EXT4_I(inode)->i_data_sem); +	if (overwrite)  		mutex_lock(&inode->i_mutex); -	}  	return ret;  } @@ -3559,6 +3642,35 @@ int ext4_can_truncate(struct inode *inode)  }  /* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, +				      loff_t len) +{ +	handle_t *handle; +	loff_t size = i_size_read(inode); + +	WARN_ON(!mutex_is_locked(&inode->i_mutex)); +	if (offset > size || offset + len < size) +		return 0; + +	if (EXT4_I(inode)->i_disksize >= size) +		return 0; + +	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); +	if (IS_ERR(handle)) +		return PTR_ERR(handle); +	ext4_update_i_disksize(inode, size); +	ext4_mark_inode_dirty(handle, inode); +	ext4_journal_stop(handle); + +	return 0; +} + +/*   * ext4_punch_hole: punches a hole in a file by releasing the blocks   * associated with the given offset and length   * @@ -3623,17 +3735,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)  	} +	/* Wait all existing dio workers, newcomers will block on i_mutex */ +	ext4_inode_block_unlocked_dio(inode); +	inode_dio_wait(inode); + +	/* +	 * Prevent page faults from reinstantiating pages we have released from +	 * page cache. 
+	 */ +	down_write(&EXT4_I(inode)->i_mmap_sem);  	first_block_offset = round_up(offset, sb->s_blocksize);  	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;  	/* Now release the pages and zero block aligned part of pages*/ -	if (last_block_offset > first_block_offset) +	if (last_block_offset > first_block_offset) { +		ret = ext4_update_disksize_before_punch(inode, offset, length); +		if (ret) +			goto out_dio;  		truncate_pagecache_range(inode, first_block_offset,  					 last_block_offset); - -	/* Wait all existing dio workers, newcomers will block on i_mutex */ -	ext4_inode_block_unlocked_dio(inode); -	inode_dio_wait(inode); +	}  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))  		credits = ext4_writepage_trans_blocks(inode); @@ -3680,16 +3801,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)  	if (IS_SYNC(inode))  		ext4_handle_sync(handle); -	/* Now release the pages again to reduce race window */ -	if (last_block_offset > first_block_offset) -		truncate_pagecache_range(inode, first_block_offset, -					 last_block_offset); -  	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);  	ext4_mark_inode_dirty(handle, inode);  out_stop:  	ext4_journal_stop(handle);  out_dio: +	up_write(&EXT4_I(inode)->i_mmap_sem);  	ext4_inode_resume_unlocked_dio(inode);  out_mutex:  	mutex_unlock(&inode->i_mutex); @@ -4076,6 +4193,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode,  		EXT4_I(inode)->i_inline_off = 0;  } +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ +	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) +		return -EOPNOTSUPP; +	*projid = EXT4_I(inode)->i_projid; +	return 0; +} +  struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  {  	struct ext4_iloc iloc; @@ -4087,6 +4212,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  	int block;  	uid_t i_uid;  	gid_t i_gid; +	projid_t i_projid;  	inode = iget_locked(sb, ino);  	if (!inode) @@ -4136,12 +4262,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)  	inode->i_mode = le16_to_cpu(raw_inode->i_mode);  	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);  	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && +	    EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && +	    EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) +		i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); +	else +		i_projid = EXT4_DEF_PROJID; +  	if (!(test_opt(inode->i_sb, NO_UID32))) {  		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;  		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;  	}  	i_uid_write(inode, i_uid);  	i_gid_write(inode, i_gid); +	ei->i_projid = make_kprojid(&init_user_ns, i_projid);  	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));  	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */ @@ -4440,6 +4574,7 @@ static int ext4_do_update_inode(handle_t *handle,  	int need_datasync = 0, set_large_file = 0;  	uid_t i_uid;  	gid_t i_gid; +	projid_t i_projid;  	spin_lock(&ei->i_raw_lock); @@ -4452,6 +4587,7 @@ static int ext4_do_update_inode(handle_t *handle,  	raw_inode->i_mode = cpu_to_le16(inode->i_mode);  	i_uid = i_uid_read(inode);  	i_gid = i_gid_read(inode); +	i_projid = from_kprojid(&init_user_ns, ei->i_projid);  	if (!(test_opt(inode->i_sb, NO_UID32))) {  		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));  		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); @@ -4529,6 +4665,15 @@ 
static int ext4_do_update_inode(handle_t *handle,  				cpu_to_le16(ei->i_extra_isize);  		}  	} + +	BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, +			EXT4_FEATURE_RO_COMPAT_PROJECT) && +	       i_projid != EXT4_DEF_PROJID); + +	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && +	    EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) +		raw_inode->i_projid = cpu_to_le32(i_projid); +  	ext4_inode_csum_set(inode, raw_inode, ei);  	spin_unlock(&ei->i_raw_lock);  	if (inode->i_sb->s_flags & MS_LAZYTIME) @@ -4824,6 +4969,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  			} else  				ext4_wait_for_tail_page_commit(inode);  		} +		down_write(&EXT4_I(inode)->i_mmap_sem);  		/*  		 * Truncate pagecache after we've waited for commit  		 * in data=journal mode to make pages freeable. @@ -4831,6 +4977,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  		truncate_pagecache(inode, inode->i_size);  		if (shrink)  			ext4_truncate(inode); +		up_write(&EXT4_I(inode)->i_mmap_sem);  	}  	if (!rc) { @@ -5279,6 +5426,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	sb_start_pagefault(inode->i_sb);  	file_update_time(vma->vm_file); + +	down_read(&EXT4_I(inode)->i_mmap_sem);  	/* Delalloc case is easy... */  	if (test_opt(inode->i_sb, DELALLOC) &&  	    !ext4_should_journal_data(inode) && @@ -5348,6 +5497,19 @@ retry_alloc:  out_ret:  	ret = block_page_mkwrite_return(ret);  out: +	up_read(&EXT4_I(inode)->i_mmap_sem);  	sb_end_pagefault(inode->i_sb);  	return ret;  } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	struct inode *inode = file_inode(vma->vm_file); +	int err; + +	down_read(&EXT4_I(inode)->i_mmap_sem); +	err = filemap_fault(vma, vmf); +	up_read(&EXT4_I(inode)->i_mmap_sem); + +	return err; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5e872fd40e5e..2b0cb84255eb 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@  #include <linux/mount.h>  #include <linux/file.h>  #include <linux/random.h> +#include <linux/quotaops.h>  #include <asm/uaccess.h>  #include "ext4_jbd2.h"  #include "ext4.h" @@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16])  	return 1;  } +static int ext4_ioctl_setflags(struct inode *inode, +			       unsigned int flags) +{ +	struct ext4_inode_info *ei = EXT4_I(inode); +	handle_t *handle = NULL; +	int err = EPERM, migrate = 0; +	struct ext4_iloc iloc; +	unsigned int oldflags, mask, i; +	unsigned int jflag; + +	/* Is it quota file? Do not allow user to mess with it */ +	if (IS_NOQUOTA(inode)) +		goto flags_out; + +	oldflags = ei->i_flags; + +	/* The JOURNAL_DATA flag is modifiable only by root */ +	jflag = flags & EXT4_JOURNAL_DATA_FL; + +	/* +	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by +	 * the relevant capability. +	 * +	 * This test looks nicer. Thanks to Pauline Middelink +	 */ +	if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { +		if (!capable(CAP_LINUX_IMMUTABLE)) +			goto flags_out; +	} + +	/* +	 * The JOURNAL_DATA flag can only be changed by +	 * the relevant capability. 
+	 */ +	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { +		if (!capable(CAP_SYS_RESOURCE)) +			goto flags_out; +	} +	if ((flags ^ oldflags) & EXT4_EXTENTS_FL) +		migrate = 1; + +	if (flags & EXT4_EOFBLOCKS_FL) { +		/* we don't support adding EOFBLOCKS flag */ +		if (!(oldflags & EXT4_EOFBLOCKS_FL)) { +			err = -EOPNOTSUPP; +			goto flags_out; +		} +	} else if (oldflags & EXT4_EOFBLOCKS_FL) +		ext4_truncate(inode); + +	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); +	if (IS_ERR(handle)) { +		err = PTR_ERR(handle); +		goto flags_out; +	} +	if (IS_SYNC(inode)) +		ext4_handle_sync(handle); +	err = ext4_reserve_inode_write(handle, inode, &iloc); +	if (err) +		goto flags_err; + +	for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { +		if (!(mask & EXT4_FL_USER_MODIFIABLE)) +			continue; +		if (mask & flags) +			ext4_set_inode_flag(inode, i); +		else +			ext4_clear_inode_flag(inode, i); +	} + +	ext4_set_inode_flags(inode); +	inode->i_ctime = ext4_current_time(inode); + +	err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: +	ext4_journal_stop(handle); +	if (err) +		goto flags_out; + +	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) +		err = ext4_change_inode_journal_flag(inode, jflag); +	if (err) +		goto flags_out; +	if (migrate) { +		if (flags & EXT4_EXTENTS_FL) +			err = ext4_ext_migrate(inode); +		else +			err = ext4_ind_migrate(inode); +	} + +flags_out: +	return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ +	struct inode *inode = file_inode(filp); +	struct super_block *sb = inode->i_sb; +	struct ext4_inode_info *ei = EXT4_I(inode); +	int err, rc; +	handle_t *handle; +	kprojid_t kprojid; +	struct ext4_iloc iloc; +	struct ext4_inode *raw_inode; + +	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, +			EXT4_FEATURE_RO_COMPAT_PROJECT)) { +		if (projid != EXT4_DEF_PROJID) +			return -EOPNOTSUPP; +		else +			return 0; +	} + +	if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) +		return -EOPNOTSUPP; + +	kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + +	if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) +		return 0; + +	err = mnt_want_write_file(filp); +	if (err) +		return err; + +	err = -EPERM; +	mutex_lock(&inode->i_mutex); +	/* Is it quota file? 
Do not allow user to mess with it */ +	if (IS_NOQUOTA(inode)) +		goto out_unlock; + +	err = ext4_get_inode_loc(inode, &iloc); +	if (err) +		goto out_unlock; + +	raw_inode = ext4_raw_inode(&iloc); +	if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { +		err = -EOVERFLOW; +		brelse(iloc.bh); +		goto out_unlock; +	} +	brelse(iloc.bh); + +	dquot_initialize(inode); + +	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, +		EXT4_QUOTA_INIT_BLOCKS(sb) + +		EXT4_QUOTA_DEL_BLOCKS(sb) + 3); +	if (IS_ERR(handle)) { +		err = PTR_ERR(handle); +		goto out_unlock; +	} + +	err = ext4_reserve_inode_write(handle, inode, &iloc); +	if (err) +		goto out_stop; + +	if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { +		struct dquot *transfer_to[MAXQUOTAS] = { }; + +		transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); +		if (transfer_to[PRJQUOTA]) { +			err = __dquot_transfer(inode, transfer_to); +			dqput(transfer_to[PRJQUOTA]); +			if (err) +				goto out_dirty; +		} +	} +	EXT4_I(inode)->i_projid = kprojid; +	inode->i_ctime = ext4_current_time(inode); +out_dirty: +	rc = ext4_mark_iloc_dirty(handle, inode, &iloc); +	if (!err) +		err = rc; +out_stop: +	ext4_journal_stop(handle); +out_unlock: +	mutex_unlock(&inode->i_mutex); +	mnt_drop_write_file(filp); +	return err; +} +#else +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ +	if (projid != EXT4_DEF_PROJID) +		return -EOPNOTSUPP; +	return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) +{ +	__u32 xflags = 0; + +	if (iflags & EXT4_SYNC_FL) +		xflags |= FS_XFLAG_SYNC; +	if (iflags & EXT4_IMMUTABLE_FL) +		xflags |= FS_XFLAG_IMMUTABLE; +	if (iflags & EXT4_APPEND_FL) +		xflags |= FS_XFLAG_APPEND; +	if (iflags & EXT4_NODUMP_FL) +		xflags |= FS_XFLAG_NODUMP; +	if (iflags & EXT4_NOATIME_FL) +		xflags |= FS_XFLAG_NOATIME; +	if (iflags & EXT4_PROJINHERIT_FL) +		xflags |= FS_XFLAG_PROJINHERIT; +	return xflags; +} + +/* Transfer xflags flags to internal */ +static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) +{ +	unsigned long iflags = 0; + +	if (xflags & FS_XFLAG_SYNC) +		iflags |= EXT4_SYNC_FL; +	if (xflags & FS_XFLAG_IMMUTABLE) +		iflags |= EXT4_IMMUTABLE_FL; +	if (xflags & FS_XFLAG_APPEND) +		iflags |= EXT4_APPEND_FL; +	if (xflags & FS_XFLAG_NODUMP) +		iflags |= EXT4_NODUMP_FL; +	if (xflags & FS_XFLAG_NOATIME) +		iflags |= EXT4_NOATIME_FL; +	if (xflags & FS_XFLAG_PROJINHERIT) +		iflags |= EXT4_PROJINHERIT_FL; + +	return iflags; +} +  long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  {  	struct inode *inode = file_inode(filp); @@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;  		return put_user(flags, (int __user *) arg);  	case EXT4_IOC_SETFLAGS: { -		handle_t *handle = NULL; -		int err, migrate = 0; -		struct ext4_iloc iloc; -		unsigned int oldflags, mask, i; -		unsigned int jflag; +		int err;  		if (!inode_owner_or_capable(inode))  			return -EACCES; @@ -235,89 +464,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		flags = ext4_mask_flags(inode->i_mode, flags); -		err = -EPERM;  		mutex_lock(&inode->i_mutex); -		/* Is it quota file? 
Do not allow user to mess with it */ -		if (IS_NOQUOTA(inode)) -			goto flags_out; - -		oldflags = ei->i_flags; - -		/* The JOURNAL_DATA flag is modifiable only by root */ -		jflag = flags & EXT4_JOURNAL_DATA_FL; - -		/* -		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by -		 * the relevant capability. -		 * -		 * This test looks nicer. Thanks to Pauline Middelink -		 */ -		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { -			if (!capable(CAP_LINUX_IMMUTABLE)) -				goto flags_out; -		} - -		/* -		 * The JOURNAL_DATA flag can only be changed by -		 * the relevant capability. -		 */ -		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { -			if (!capable(CAP_SYS_RESOURCE)) -				goto flags_out; -		} -		if ((flags ^ oldflags) & EXT4_EXTENTS_FL) -			migrate = 1; - -		if (flags & EXT4_EOFBLOCKS_FL) { -			/* we don't support adding EOFBLOCKS flag */ -			if (!(oldflags & EXT4_EOFBLOCKS_FL)) { -				err = -EOPNOTSUPP; -				goto flags_out; -			} -		} else if (oldflags & EXT4_EOFBLOCKS_FL) -			ext4_truncate(inode); - -		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); -		if (IS_ERR(handle)) { -			err = PTR_ERR(handle); -			goto flags_out; -		} -		if (IS_SYNC(inode)) -			ext4_handle_sync(handle); -		err = ext4_reserve_inode_write(handle, inode, &iloc); -		if (err) -			goto flags_err; - -		for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { -			if (!(mask & EXT4_FL_USER_MODIFIABLE)) -				continue; -			if (mask & flags) -				ext4_set_inode_flag(inode, i); -			else -				ext4_clear_inode_flag(inode, i); -		} - -		ext4_set_inode_flags(inode); -		inode->i_ctime = ext4_current_time(inode); - -		err = ext4_mark_iloc_dirty(handle, inode, &iloc); -flags_err: -		ext4_journal_stop(handle); -		if (err) -			goto flags_out; - -		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) -			err = ext4_change_inode_journal_flag(inode, jflag); -		if (err) -			goto flags_out; -		if (migrate) { -			if (flags & EXT4_EXTENTS_FL) -				err = ext4_ext_migrate(inode); -			else -				err = ext4_ind_migrate(inode); -		} - -flags_out: +		err = ext4_ioctl_setflags(inode, flags);  		mutex_unlock(&inode->i_mutex);  		mnt_drop_write_file(filp);  		return err; @@ -689,6 +837,60 @@ encryption_policy_out:  		return -EOPNOTSUPP;  #endif  	} +	case EXT4_IOC_FSGETXATTR: +	{ +		struct fsxattr fa; + +		memset(&fa, 0, sizeof(struct fsxattr)); +		ext4_get_inode_flags(ei); +		fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); + +		if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, +				EXT4_FEATURE_RO_COMPAT_PROJECT)) { +			fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, +				EXT4_I(inode)->i_projid); +		} + +		if (copy_to_user((struct fsxattr __user *)arg, +				 &fa, sizeof(fa))) +			return -EFAULT; +		return 0; +	} +	case EXT4_IOC_FSSETXATTR: +	{ +		struct fsxattr fa; +		int err; + +		if (copy_from_user(&fa, (struct fsxattr __user *)arg, +				   sizeof(fa))) +			return -EFAULT; + +		/* Make sure caller has proper permission */ +		if (!inode_owner_or_capable(inode)) +			return -EACCES; + +		err = mnt_want_write_file(filp); +		if (err) +			return err; + +		flags = ext4_xflags_to_iflags(fa.fsx_xflags); +		flags = ext4_mask_flags(inode->i_mode, flags); + +		mutex_lock(&inode->i_mutex); +		flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | +			 (flags & EXT4_FL_XFLAG_VISIBLE); +		err = ext4_ioctl_setflags(inode, flags); +		mutex_unlock(&inode->i_mutex); +		mnt_drop_write_file(filp); +		if (err) +			return err; + +		err = ext4_ioctl_setproject(filp, fa.fsx_projid); +		if (err) +			return err; + +		return 0; +	}  	
default:  		return -ENOTTY;  	} diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f27e0c2598c5..854f75de4599 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,  		struct ext4_filename *fname,  		struct ext4_dir_entry_2 **res_dir);  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, -			     struct dentry *dentry, struct inode *inode); +			     struct inode *dir, struct inode *inode);  /* checksumming functions */  void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -1928,10 +1928,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,   * directory, and adds the dentry to the indexed directory.   */  static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, -			    struct dentry *dentry, +			    struct inode *dir,  			    struct inode *inode, struct buffer_head *bh)  { -	struct inode	*dir = d_inode(dentry->d_parent);  	struct buffer_head *bh2;  	struct dx_root	*root;  	struct dx_frame	frames[2], *frame; @@ -2086,8 +2085,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,  		return retval;  	if (ext4_has_inline_data(dir)) { -		retval = ext4_try_add_inline_entry(handle, &fname, -						   dentry, inode); +		retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);  		if (retval < 0)  			goto out;  		if (retval == 1) { @@ -2097,7 +2095,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,  	}  	if (is_dx(dir)) { -		retval = ext4_dx_add_entry(handle, &fname, dentry, inode); +		retval = ext4_dx_add_entry(handle, &fname, dir, inode);  		if (!retval || (retval != ERR_BAD_DX_DIR))  			goto out;  		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2119,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,  		if (blocks == 1 && !dx_fallback &&  		    ext4_has_feature_dir_index(sb)) { -			retval = make_indexed_dir(handle, &fname, dentry, +			retval = make_indexed_dir(handle, &fname, dir,  						  inode, bh);  			bh = NULL; /* make_indexed_dir releases bh */  			goto out; @@ -2154,12 +2152,11 @@ out:   * Returns 0 for success, or a negative error value   */  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, -			     struct dentry *dentry, struct inode *inode) +			     struct inode *dir, struct inode *inode)  {  	struct dx_frame frames[2], *frame;  	struct dx_entry *entries, *at;  	struct buffer_head *bh; -	struct inode *dir = d_inode(dentry->d_parent);  	struct super_block *sb = dir->i_sb;  	struct ext4_dir_entry_2 *de;  	int err; @@ -3212,6 +3209,12 @@ static int ext4_link(struct dentry *old_dentry,  	if (ext4_encrypted_inode(dir) &&  	    !ext4_is_child_context_consistent_with_parent(dir, inode))  		return -EPERM; + +       if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && +	   (!projid_eq(EXT4_I(dir)->i_projid, +		       EXT4_I(old_dentry->d_inode)->i_projid))) +		return -EXDEV; +  	err = dquot_initialize(dir);  	if (err)  		return err; @@ -3492,6 +3495,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,  	int credits;  	u8 old_file_type; +	if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && +	    (!projid_eq(EXT4_I(new_dir)->i_projid, +			EXT4_I(old_dentry->d_inode)->i_projid))) +		return -EXDEV; +  	retval = dquot_initialize(old.dir);  	if (retval)  		return retval; @@ -3701,6 +3709,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,  							   new.inode)))  		
return -EPERM; +	if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && +	     !projid_eq(EXT4_I(new_dir)->i_projid, +			EXT4_I(old_dentry->d_inode)->i_projid)) || +	    (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && +	     !projid_eq(EXT4_I(old_dir)->i_projid, +			EXT4_I(new_dentry->d_inode)->i_projid))) +		return -EXDEV; +  	retval = dquot_initialize(old.dir);  	if (retval)  		return retval; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f1b56ff01208..00c98fab6333 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void);  static void ext4_unregister_li_request(struct super_block *sb);  static void ext4_clear_request_list(void); +/* + * Lock ordering + * + * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and + * i_mmap_rwsem (inode->i_mmap_rwsem)! + * + * page fault path: + * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> + *   page lock -> i_data_sem (rw) + * + * buffered write path: + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction start -> page lock -> + *   i_data_sem (rw) + * + * truncate: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + *   i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + *   transaction start -> i_data_sem (rw) + * + * direct IO: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> + *   transaction start -> i_data_sem (rw) + * + * writepages: + * transaction start -> page lock(s) -> i_data_sem (rw) + */ +  #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)  static struct file_system_type ext2_fs_type = {  	.owner		= THIS_MODULE, @@ -958,6 +988,7 @@ static void init_once(void *foo)  	INIT_LIST_HEAD(&ei->i_orphan);  	init_rwsem(&ei->xattr_sem);  	init_rwsem(&ei->i_data_sem); +	init_rwsem(&ei->i_mmap_sem);  	inode_init_once(&ei->vfs_inode);  } @@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,  }  #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? 
"user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +static char *quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t])  static int ext4_write_dquot(struct dquot *dquot);  static int ext4_acquire_dquot(struct dquot *dquot); @@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = {  	.write_info	= ext4_write_info,  	.alloc_dquot	= dquot_alloc,  	.destroy_dquot	= dquot_destroy, +	.get_projid	= ext4_get_projid,  };  static const struct quotactl_ops ext4_qctl_operations = { @@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)  			 "without CONFIG_QUOTA");  		return 0;  	} +	if (ext4_has_feature_project(sb) && !readonly) { +		ext4_msg(sb, KERN_ERR, +			 "Filesystem with project quota feature cannot be mounted RDWR " +			 "without CONFIG_QUOTA"); +		return 0; +	}  #endif  /* CONFIG_QUOTA */  	return 1;  } @@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  		sb->s_qcop = &dquot_quotactl_sysfile_ops;  	else  		sb->s_qcop = &ext4_qctl_operations; -	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;  #endif  	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -4790,6 +4828,48 @@ restore_opts:  	return err;  } +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, +			       kprojid_t projid, struct kstatfs *buf) +{ +	struct kqid qid; +	struct dquot *dquot; +	u64 limit; +	u64 curblock; + +	qid = make_kqid_projid(projid); +	dquot = dqget(sb, qid); +	if (IS_ERR(dquot)) +		return PTR_ERR(dquot); +	spin_lock(&dq_data_lock); + +	limit = (dquot->dq_dqb.dqb_bsoftlimit ? +		 dquot->dq_dqb.dqb_bsoftlimit : +		 dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; +	if (limit && buf->f_blocks > limit) { +		curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; +		buf->f_blocks = limit; +		buf->f_bfree = buf->f_bavail = +			(buf->f_blocks > curblock) ? +			 (buf->f_blocks - curblock) : 0; +	} + +	limit = dquot->dq_dqb.dqb_isoftlimit ? +		dquot->dq_dqb.dqb_isoftlimit : +		dquot->dq_dqb.dqb_ihardlimit; +	if (limit && buf->f_files > limit) { +		buf->f_files = limit; +		buf->f_ffree = +			(buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
+			 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; +	} + +	spin_unlock(&dq_data_lock); +	dqput(dquot); +	return 0; +} +#endif +  static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)  {  	struct super_block *sb = dentry->d_sb; @@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)  	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;  	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; +#ifdef CONFIG_QUOTA +	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && +	    sb_has_quota_limits_enabled(sb, PRJQUOTA)) +		ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif  	return 0;  } @@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,  	struct inode *qf_inode;  	unsigned long qf_inums[EXT4_MAXQUOTAS] = {  		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), -		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) +		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), +		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)  	};  	BUG_ON(!ext4_has_feature_quota(sb)); @@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb)  	int type, err = 0;  	unsigned long qf_inums[EXT4_MAXQUOTAS] = {  		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), -		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) +		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), +		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)  	};  	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670d99..c70d06a383e2 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@   */  static inline void ext4_truncate_failed_write(struct inode *inode)  { +	down_write(&EXT4_I(inode)->i_mmap_sem);  	truncate_inode_pages(inode->i_mapping, inode->i_size);  	ext4_truncate(inode); +	up_write(&EXT4_I(inode)->i_mmap_sem);  }  /* diff --git a/fs/filesystems.c b/fs/filesystems.c index 5797d45a78cb..c5618db110be 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs)  static struct file_system_type **find_filesystem(const char *name, unsigned len)  {  	struct file_system_type **p; -	for (p=&file_systems; *p; p=&(*p)->next) -		if (strlen((*p)->name) == len && -		    strncmp((*p)->name, name, len) == 0) +	for (p = &file_systems; *p; p = &(*p)->next) +		if (strncmp((*p)->name, name, len) == 0 && +		    !(*p)->name[len])  			break;  	return p;  } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index f92612e4b9d6..474e57f834e6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,  	unsigned int gen;  	int noqueue_attempted = 0;  	int dlm_locked = 0; +	int kick_dc = 0;  	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {  		mlog_errno(-EINVAL); @@ -1524,7 +1525,12 @@ update_holders:  unlock:  	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); +	/* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ +	kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); +  	spin_unlock_irqrestore(&lockres->l_lock, flags); +	if (kick_dc) +		ocfs2_wake_downconvert_thread(osb);  out:  	/*  	 * This is helping work around a lock inversion between the page lock diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576;   */  unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. 
Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; +  /*   * We use a start+len construction, which provides full use of the    * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on)  	return retval;  } +static void account_pipe_buffers(struct pipe_inode_info *pipe, +                                 unsigned long old, unsigned long new) +{ +	atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ +	return pipe_user_pages_soft && +	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ +	return pipe_user_pages_hard && +	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} +  struct pipe_inode_info *alloc_pipe_info(void)  {  	struct pipe_inode_info *pipe;  	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);  	if (pipe) { -		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); +		unsigned long pipe_bufs = PIPE_DEF_BUFFERS; +		struct user_struct *user = get_current_user(); + +		if (!too_many_pipe_buffers_hard(user)) { +			if (too_many_pipe_buffers_soft(user)) +				pipe_bufs = 1; +			pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); +		} +  		if (pipe->bufs) {  			init_waitqueue_head(&pipe->wait);  			pipe->r_counter = pipe->w_counter = 1; -			pipe->buffers = PIPE_DEF_BUFFERS; +			pipe->buffers = pipe_bufs; +			pipe->user = user; +			account_pipe_buffers(pipe, 0, pipe_bufs);  			mutex_init(&pipe->mutex);  			return pipe;  		} +		free_uid(user);  		kfree(pipe);  	} @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)  {  	int i; +	account_pipe_buffers(pipe, pipe->buffers, 0); +	free_uid(pipe->user);  	for (i = 0; i < pipe->buffers; i++) {  		struct pipe_buffer *buf = pipe->bufs + i;  		if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)  			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));  	} +	account_pipe_buffers(pipe, pipe->buffers, nr_pages);  	pipe->curbuf = 0;  	kfree(pipe->bufs);  	pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)  		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {  			ret = -EPERM;  			goto out; +		} else if ((too_many_pipe_buffers_hard(pipe->user) || +			    too_many_pipe_buffers_soft(pipe->user)) && +		           !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { +			ret = -EPERM; +			goto out;  		}  		ret = pipe_set_size(pipe, nr_pages);  		break; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 71ffc91060f6..85d16c67c33e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  	pte_t *pte;  	spinlock_t *ptl; -	if (pmd_trans_huge_lock(pmd, vma, &ptl)) { +	ptl = pmd_trans_huge_lock(pmd, vma); +	if (ptl) {  		smaps_pmd_entry(pmd, addr, walk);  		spin_unlock(ptl);  		return 0; @@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,  	spinlock_t *ptl;  	struct page *page; -	if (pmd_trans_huge_lock(pmd, vma, &ptl)) { +	ptl = pmd_trans_huge_lock(pmd, vma); +	if (ptl) {  		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {  			clear_soft_dirty_pmd(vma, addr, pmd);  			goto out; @@ -1187,7 
+1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,  	int err = 0;  #ifdef CONFIG_TRANSPARENT_HUGEPAGE -	if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { +	ptl = pmd_trans_huge_lock(pmdp, vma); +	if (ptl) {  		u64 flags = 0, frame = 0;  		pmd_t pmd = *pmdp; @@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,  	pte_t *orig_pte;  	pte_t *pte; -	if (pmd_trans_huge_lock(pmd, vma, &ptl)) { +	ptl = pmd_trans_huge_lock(pmd, vma); +	if (ptl) {  		pte_t huge_pte = *(pte_t *)pmd;  		struct page *page; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 05db7473bcb5..c0306ec8ed7b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s)  		pathrelse(&path);  		inode = reiserfs_iget(s, &obj_key); -		if (!inode) { +		if (IS_ERR_OR_NULL(inode)) {  			/*  			 * the unlink almost completed, it just did not  			 * manage to remove "save" link and release objectid diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e2536bb1c760..dc97eb21af07 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)  /*   * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s.   */  #define XFS_DIFLAG_REALTIME_BIT  0	/* file's blocks come from rt area */  #define XFS_DIFLAG_PREALLOC_BIT  1	/* file space has been preallocated */ @@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)  	 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)  /* + * Values for di_flags2 These start by being exposed to userspace in the upper + * 16 bits of the XFS_XFLAG_s range. + */ +#define XFS_DIFLAG2_DAX_BIT	0	/* use DAX for this inode */ +#define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT) + +#define XFS_DIFLAG2_ANY		(XFS_DIFLAG2_DAX) + +/*   * Inode number format:   * low inopblog bits - offset in block   * next agblklog bits - block number in ag diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b2b73a998d42..fffe3d01bd9f 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -36,40 +36,6 @@ struct dioattr {  #endif  /* - * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. - */ -#ifndef HAVE_FSXATTR -struct fsxattr { -	__u32		fsx_xflags;	/* xflags field value (get/set) */ -	__u32		fsx_extsize;	/* extsize field value (get/set)*/ -	__u32		fsx_nextents;	/* nextents field value (get)	*/ -	__u32		fsx_projid;	/* project identifier (get/set) */ -	unsigned char	fsx_pad[12]; -}; -#endif - -/* - * Flags for the bs_xflags/fsx_xflags field - * There should be a one-to-one correspondence between these flags and the - * XFS_DIFLAG_s. 
- */ -#define XFS_XFLAG_REALTIME	0x00000001	/* data in realtime volume */ -#define XFS_XFLAG_PREALLOC	0x00000002	/* preallocated file extents */ -#define XFS_XFLAG_IMMUTABLE	0x00000008	/* file cannot be modified */ -#define XFS_XFLAG_APPEND	0x00000010	/* all writes append */ -#define XFS_XFLAG_SYNC		0x00000020	/* all writes synchronous */ -#define XFS_XFLAG_NOATIME	0x00000040	/* do not update access time */ -#define XFS_XFLAG_NODUMP	0x00000080	/* do not include in backups */ -#define XFS_XFLAG_RTINHERIT	0x00000100	/* create with rt bit set */ -#define XFS_XFLAG_PROJINHERIT	0x00000200	/* create with parents projid */ -#define XFS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */ -#define XFS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */ -#define XFS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */ -#define XFS_XFLAG_NODEFRAG	0x00002000  	/* do not defragment */ -#define XFS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */ -#define XFS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/ - -/*   * Structure for XFS_IOC_GETBMAP.   * On input, fill in bmv_offset and bmv_length of the first structure   * to indicate the area of interest in the file, and bmv_entries with @@ -514,8 +480,8 @@ typedef struct xfs_swapext  #define XFS_IOC_ALLOCSP		_IOW ('X', 10, struct xfs_flock64)  #define XFS_IOC_FREESP		_IOW ('X', 11, struct xfs_flock64)  #define XFS_IOC_DIOINFO		_IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR	_IOR ('X', 31, struct fsxattr) -#define XFS_IOC_FSSETXATTR	_IOW ('X', 32, struct fsxattr) +#define XFS_IOC_FSGETXATTR	FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR	FS_IOC_FSSETXATTR  #define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)  #define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)  #define XFS_IOC_GETBMAP		_IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index daed4bfb85b2..435c7de42e5f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1527,6 +1527,16 @@ xfs_wait_buftarg(  	LIST_HEAD(dispose);  	int loop = 0; +	/* +	 * We need to flush the buffer workqueue to ensure that all IO +	 * completion processing is 100% done. Just waiting on buffer locks is +	 * not sufficient for async IO as the reference count held over IO is +	 * not released until after the buffer lock is dropped. Hence we need to +	 * ensure here that all reference counts have been dropped before we +	 * start walking the LRU list. +	 */ +	drain_workqueue(btp->bt_mount->m_buf_workqueue); +  	/* loop until there is nothing left on the lru list. 
*/  	while (list_lru_count(&btp->bt_lru)) {  		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae3758a90ed6..ceba1a83cacc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -610,60 +610,69 @@ __xfs_iflock(  STATIC uint  _xfs_dic2xflags( -	__uint16_t		di_flags) +	__uint16_t		di_flags, +	uint64_t		di_flags2, +	bool			has_attr)  {  	uint			flags = 0;  	if (di_flags & XFS_DIFLAG_ANY) {  		if (di_flags & XFS_DIFLAG_REALTIME) -			flags |= XFS_XFLAG_REALTIME; +			flags |= FS_XFLAG_REALTIME;  		if (di_flags & XFS_DIFLAG_PREALLOC) -			flags |= XFS_XFLAG_PREALLOC; +			flags |= FS_XFLAG_PREALLOC;  		if (di_flags & XFS_DIFLAG_IMMUTABLE) -			flags |= XFS_XFLAG_IMMUTABLE; +			flags |= FS_XFLAG_IMMUTABLE;  		if (di_flags & XFS_DIFLAG_APPEND) -			flags |= XFS_XFLAG_APPEND; +			flags |= FS_XFLAG_APPEND;  		if (di_flags & XFS_DIFLAG_SYNC) -			flags |= XFS_XFLAG_SYNC; +			flags |= FS_XFLAG_SYNC;  		if (di_flags & XFS_DIFLAG_NOATIME) -			flags |= XFS_XFLAG_NOATIME; +			flags |= FS_XFLAG_NOATIME;  		if (di_flags & XFS_DIFLAG_NODUMP) -			flags |= XFS_XFLAG_NODUMP; +			flags |= FS_XFLAG_NODUMP;  		if (di_flags & XFS_DIFLAG_RTINHERIT) -			flags |= XFS_XFLAG_RTINHERIT; +			flags |= FS_XFLAG_RTINHERIT;  		if (di_flags & XFS_DIFLAG_PROJINHERIT) -			flags |= XFS_XFLAG_PROJINHERIT; +			flags |= FS_XFLAG_PROJINHERIT;  		if (di_flags & XFS_DIFLAG_NOSYMLINKS) -			flags |= XFS_XFLAG_NOSYMLINKS; +			flags |= FS_XFLAG_NOSYMLINKS;  		if (di_flags & XFS_DIFLAG_EXTSIZE) -			flags |= XFS_XFLAG_EXTSIZE; +			flags |= FS_XFLAG_EXTSIZE;  		if (di_flags & XFS_DIFLAG_EXTSZINHERIT) -			flags |= XFS_XFLAG_EXTSZINHERIT; +			flags |= FS_XFLAG_EXTSZINHERIT;  		if (di_flags & XFS_DIFLAG_NODEFRAG) -			flags |= XFS_XFLAG_NODEFRAG; +			flags |= FS_XFLAG_NODEFRAG;  		if (di_flags & XFS_DIFLAG_FILESTREAM) -			flags |= XFS_XFLAG_FILESTREAM; +			flags |= FS_XFLAG_FILESTREAM;  	} +	if (di_flags2 & XFS_DIFLAG2_ANY) { +		if (di_flags2 & XFS_DIFLAG2_DAX) +			flags |= FS_XFLAG_DAX; +	} + +	if (has_attr) +		flags |= FS_XFLAG_HASATTR; +  	return flags;  }  uint  xfs_ip2xflags( -	xfs_inode_t		*ip) +	struct xfs_inode	*ip)  { -	xfs_icdinode_t		*dic = &ip->i_d; +	struct xfs_icdinode	*dic = &ip->i_d; -	return _xfs_dic2xflags(dic->di_flags) | -				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); +	return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));  }  uint  xfs_dic2xflags( -	xfs_dinode_t		*dip) +	struct xfs_dinode	*dip)  { -	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | -				(XFS_DFORK_Q(dip) ? 
XFS_XFLAG_HASATTR : 0); +	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), +				be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));  }  /* @@ -862,7 +871,8 @@ xfs_ialloc(  	case S_IFREG:  	case S_IFDIR:  		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { -			uint	di_flags = 0; +			uint64_t	di_flags2 = 0; +			uint		di_flags = 0;  			if (S_ISDIR(mode)) {  				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) @@ -898,7 +908,11 @@ xfs_ialloc(  				di_flags |= XFS_DIFLAG_NODEFRAG;  			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)  				di_flags |= XFS_DIFLAG_FILESTREAM; +			if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) +				di_flags2 |= XFS_DIFLAG2_DAX; +  			ip->i_d.di_flags |= di_flags; +			ip->i_d.di_flags2 |= di_flags2;  		}  		/* FALLTHROUGH */  	case S_IFLNK: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42738deec6d..478d04e07f95 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -859,25 +859,25 @@ xfs_merge_ioc_xflags(  	unsigned int	xflags = start;  	if (flags & FS_IMMUTABLE_FL) -		xflags |= XFS_XFLAG_IMMUTABLE; +		xflags |= FS_XFLAG_IMMUTABLE;  	else -		xflags &= ~XFS_XFLAG_IMMUTABLE; +		xflags &= ~FS_XFLAG_IMMUTABLE;  	if (flags & FS_APPEND_FL) -		xflags |= XFS_XFLAG_APPEND; +		xflags |= FS_XFLAG_APPEND;  	else -		xflags &= ~XFS_XFLAG_APPEND; +		xflags &= ~FS_XFLAG_APPEND;  	if (flags & FS_SYNC_FL) -		xflags |= XFS_XFLAG_SYNC; +		xflags |= FS_XFLAG_SYNC;  	else -		xflags &= ~XFS_XFLAG_SYNC; +		xflags &= ~FS_XFLAG_SYNC;  	if (flags & FS_NOATIME_FL) -		xflags |= XFS_XFLAG_NOATIME; +		xflags |= FS_XFLAG_NOATIME;  	else -		xflags &= ~XFS_XFLAG_NOATIME; +		xflags &= ~FS_XFLAG_NOATIME;  	if (flags & FS_NODUMP_FL) -		xflags |= XFS_XFLAG_NODUMP; +		xflags |= FS_XFLAG_NODUMP;  	else -		xflags &= ~XFS_XFLAG_NODUMP; +		xflags &= ~FS_XFLAG_NODUMP;  	return xflags;  } @@ -945,40 +945,51 @@ xfs_set_diflags(  	unsigned int		xflags)  {  	unsigned int		di_flags; +	uint64_t		di_flags2;  	/* can't set PREALLOC this way, just preserve it */  	di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); -	if (xflags & XFS_XFLAG_IMMUTABLE) +	if (xflags & FS_XFLAG_IMMUTABLE)  		di_flags |= XFS_DIFLAG_IMMUTABLE; -	if (xflags & XFS_XFLAG_APPEND) +	if (xflags & FS_XFLAG_APPEND)  		di_flags |= XFS_DIFLAG_APPEND; -	if (xflags & XFS_XFLAG_SYNC) +	if (xflags & FS_XFLAG_SYNC)  		di_flags |= XFS_DIFLAG_SYNC; -	if (xflags & XFS_XFLAG_NOATIME) +	if (xflags & FS_XFLAG_NOATIME)  		di_flags |= XFS_DIFLAG_NOATIME; -	if (xflags & XFS_XFLAG_NODUMP) +	if (xflags & FS_XFLAG_NODUMP)  		di_flags |= XFS_DIFLAG_NODUMP; -	if (xflags & XFS_XFLAG_NODEFRAG) +	if (xflags & FS_XFLAG_NODEFRAG)  		di_flags |= XFS_DIFLAG_NODEFRAG; -	if (xflags & XFS_XFLAG_FILESTREAM) +	if (xflags & FS_XFLAG_FILESTREAM)  		di_flags |= XFS_DIFLAG_FILESTREAM;  	if (S_ISDIR(ip->i_d.di_mode)) { -		if (xflags & XFS_XFLAG_RTINHERIT) +		if (xflags & FS_XFLAG_RTINHERIT)  			di_flags |= XFS_DIFLAG_RTINHERIT; -		if (xflags & XFS_XFLAG_NOSYMLINKS) +		if (xflags & FS_XFLAG_NOSYMLINKS)  			di_flags |= XFS_DIFLAG_NOSYMLINKS; -		if (xflags & XFS_XFLAG_EXTSZINHERIT) +		if (xflags & FS_XFLAG_EXTSZINHERIT)  			di_flags |= XFS_DIFLAG_EXTSZINHERIT; -		if (xflags & XFS_XFLAG_PROJINHERIT) +		if (xflags & FS_XFLAG_PROJINHERIT)  			di_flags |= XFS_DIFLAG_PROJINHERIT;  	} else if (S_ISREG(ip->i_d.di_mode)) { -		if (xflags & XFS_XFLAG_REALTIME) +		if (xflags & FS_XFLAG_REALTIME)  			di_flags |= XFS_DIFLAG_REALTIME; -		if (xflags & XFS_XFLAG_EXTSIZE) +		if (xflags & FS_XFLAG_EXTSIZE)  			di_flags |= XFS_DIFLAG_EXTSIZE;  	} -  	ip->i_d.di_flags = di_flags; + +	/* diflags2 only 
valid for v3 inodes. */ +	if (ip->i_d.di_version < 3) +		return; + +	di_flags2 = 0; +	if (xflags & FS_XFLAG_DAX) +		di_flags2 |= XFS_DIFLAG2_DAX; + +	ip->i_d.di_flags2 = di_flags2; +  }  STATIC void @@ -988,22 +999,27 @@ xfs_diflags_to_linux(  	struct inode		*inode = VFS_I(ip);  	unsigned int		xflags = xfs_ip2xflags(ip); -	if (xflags & XFS_XFLAG_IMMUTABLE) +	if (xflags & FS_XFLAG_IMMUTABLE)  		inode->i_flags |= S_IMMUTABLE;  	else  		inode->i_flags &= ~S_IMMUTABLE; -	if (xflags & XFS_XFLAG_APPEND) +	if (xflags & FS_XFLAG_APPEND)  		inode->i_flags |= S_APPEND;  	else  		inode->i_flags &= ~S_APPEND; -	if (xflags & XFS_XFLAG_SYNC) +	if (xflags & FS_XFLAG_SYNC)  		inode->i_flags |= S_SYNC;  	else  		inode->i_flags &= ~S_SYNC; -	if (xflags & XFS_XFLAG_NOATIME) +	if (xflags & FS_XFLAG_NOATIME)  		inode->i_flags |= S_NOATIME;  	else  		inode->i_flags &= ~S_NOATIME; +	if (xflags & FS_XFLAG_DAX) +		inode->i_flags |= S_DAX; +	else +		inode->i_flags &= ~S_DAX; +  }  static int @@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags(  	/* Can't change realtime flag if any extents are allocated. */  	if ((ip->i_d.di_nextents || ip->i_delayed_blks) && -	    XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) +	    XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))  		return -EINVAL;  	/* If realtime flag is set then must have realtime device */ -	if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { +	if (fa->fsx_xflags & FS_XFLAG_REALTIME) {  		if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||  		    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))  			return -EINVAL; @@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags(  	 * we have appropriate permission.  	 */  	if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || -	     (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && +	     (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&  	    !capable(CAP_LINUX_IMMUTABLE))  		return -EPERM; @@ -1095,8 +1111,8 @@ out_cancel:   * extent size hint validation is somewhat cumbersome. Rules are:   *   * 1. extent size hint is only valid for directories and regular files - * 2. XFS_XFLAG_EXTSIZE is only valid for regular files - * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.   * 4. can only be changed on regular files if no extents are allocated   * 5. can be changed on directories at any time   * 6. extsize hint of 0 turns off hints, clears inode flags. 
@@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize(  {  	struct xfs_mount	*mp = ip->i_mount; -	if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) +	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))  		return -EINVAL; -	if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && +	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&  	    !S_ISDIR(ip->i_d.di_mode))  		return -EINVAL; @@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize(  			return -EINVAL;  		if (XFS_IS_REALTIME_INODE(ip) || -		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { +		    (fa->fsx_xflags & FS_XFLAG_REALTIME)) {  			size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;  		} else {  			size = mp->m_sb.sb_blocksize; @@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize(  		if (fa->fsx_extsize % size)  			return -EINVAL;  	} else -		fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); +		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);  	return 0;  } @@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid(  	if (xfs_get_projid(ip) != fa->fsx_projid)  		return -EINVAL; -	if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != +	if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=  	    (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))  		return -EINVAL; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 06eafafe636e..76b71a1c6c32 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1205,8 +1205,8 @@ xfs_diflags_to_iflags(  		inode->i_flags |= S_SYNC;  	if (flags & XFS_DIFLAG_NOATIME)  		inode->i_flags |= S_NOATIME; -	/* XXX: Also needs an on-disk per inode flag! */ -	if (ip->i_mount->m_flags & XFS_MOUNT_DAX) +	if (ip->i_mount->m_flags & XFS_MOUNT_DAX || +	    ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)  		inode->i_flags |= S_DAX;  } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index aa67339b9537..4f18fd92ca13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -497,7 +497,6 @@ xfsaild(  	long		tout = 0;	/* milliseconds */  	current->flags |= PF_MEMALLOC; -	set_freezable();  	while (!kthread_should_stop()) {  		if (tout && tout <= 20) | 
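The ext4 hunks above all revolve around one mechanism: a new per-inode rw_semaphore, i_mmap_sem, taken shared by every page-fault path (ext4_filemap_fault, ext4_dax_fault, ext4_dax_pfn_mkwrite, ext4_page_mkwrite) and taken exclusive by truncate, hole punch, collapse/insert range and failed-write truncation around the point where they evict the page cache. Because eviction happens under the exclusive lock, a concurrent fault can never reinstantiate a page in the middle of a punch. A minimal userspace C model of that discipline follows; it assumes pthreads, and every toy_* name is invented for illustration, so this is a sketch of the idea, not kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	pthread_rwlock_t i_mmap_sem;	/* models EXT4_I(inode)->i_mmap_sem */
	long		 i_size;	/* file size in bytes */
	bool		 cached;	/* models one page-cache page */
};

/* Fault path: shared lock, size check, then (re)instantiate the page. */
static int toy_filemap_fault(struct toy_inode *inode, long offset)
{
	int ret = 0;

	pthread_rwlock_rdlock(&inode->i_mmap_sem);
	if (offset >= inode->i_size)
		ret = -1;		/* models VM_FAULT_SIGBUS */
	else
		inode->cached = true;
	pthread_rwlock_unlock(&inode->i_mmap_sem);
	return ret;
}

/* Punch path: exclusive lock held across eviction, as in ext4_punch_hole(). */
static void toy_punch_hole(struct toy_inode *inode)
{
	pthread_rwlock_wrlock(&inode->i_mmap_sem);
	inode->cached = false;		/* models truncate_pagecache_range() */
	/* ... blocks would be freed here; no fault can repopulate the cache ... */
	pthread_rwlock_unlock(&inode->i_mmap_sem);
}

int main(void)
{
	struct toy_inode inode = { .i_size = 4096, .cached = false };

	pthread_rwlock_init(&inode.i_mmap_sem, NULL);
	toy_filemap_fault(&inode, 0);
	toy_punch_hole(&inode);
	printf("page cached after punch: %d\n", inode.cached);
	pthread_rwlock_destroy(&inode.i_mmap_sem);
	return 0;
}

The same ordering explains the pfn_mkwrite comment in the fs/ext4/file.c hunk: cycling through the semaphore guarantees that any punch which started before the fault has fully finished by the time the fault proceeds.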

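The fs/pipe.c hunk introduces per-user accounting of pipe buffer pages with two sysctl knobs, pipe_user_pages_soft and pipe_user_pages_hard: past the soft cap a new pipe is shrunk to a single buffer, past the hard cap allocation fails outright, and F_SETPIPE_SZ is refused to unprivileged callers over either cap. A self-contained C11 sketch of that policy follows; the toy_* names are invented, and the soft default of 16384 assumes PIPE_DEF_BUFFERS * INR_OPEN_CUR = 16 * 1024 as in the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_PIPE_DEF_BUFFERS 16

static unsigned long toy_pages_hard;		/* 0 = no hard limit, the patch default */
static unsigned long toy_pages_soft = 16384;	/* assumed PIPE_DEF_BUFFERS * INR_OPEN_CUR */

struct toy_user {
	atomic_long pipe_bufs;			/* models user_struct->pipe_bufs */
};

static void account_pipe_buffers(struct toy_user *user, long old, long new)
{
	atomic_fetch_add(&user->pipe_bufs, new - old);
}

static bool too_many_soft(struct toy_user *user)
{
	return toy_pages_soft &&
	       atomic_load(&user->pipe_bufs) >= (long)toy_pages_soft;
}

static bool too_many_hard(struct toy_user *user)
{
	return toy_pages_hard &&
	       atomic_load(&user->pipe_bufs) >= (long)toy_pages_hard;
}

/* Policy from alloc_pipe_info(): shrink to one buffer past the soft cap,
 * refuse outright past the hard cap. */
static long toy_alloc_pipe(struct toy_user *user)
{
	long bufs = TOY_PIPE_DEF_BUFFERS;

	if (too_many_hard(user))
		return -1;
	if (too_many_soft(user))
		bufs = 1;
	account_pipe_buffers(user, 0, bufs);
	return bufs;
}

int main(void)
{
	struct toy_user user = { .pipe_bufs = 0 };

	printf("first pipe gets %ld buffers\n", toy_alloc_pipe(&user));
	/* Pretend this user already holds enough pages to cross the soft cap. */
	account_pipe_buffers(&user, 0, (long)toy_pages_soft);
	printf("next pipe gets %ld buffer(s)\n", toy_alloc_pipe(&user));
	return 0;
}

Note that free_pipe_info() in the hunk undoes the accounting with account_pipe_buffers(pipe, pipe->buffers, 0) before dropping the user reference, which is what keeps the per-user counter balanced.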