Diffstat (limited to 'fs')
120 files changed, 1743 insertions, 1104 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 52233fa6195f..887b673f6223 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -589,7 +589,7 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
  */
 void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
 {
-	unsigned int debug_id = cell->debug_id;
+	unsigned int debug_id;
 	time64_t now, expire_delay;
 	int u, a;
 
@@ -604,6 +604,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
 	if (cell->vl_servers->nr_servers)
 		expire_delay = afs_cell_gc_delay;
 
+	debug_id = cell->debug_id;
 	u = atomic_read(&cell->ref);
 	a = atomic_dec_return(&cell->active);
 	trace_afs_cell(debug_id, u, a, reason);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 1d2e61e0ab04..9068d5578a26 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -281,8 +281,7 @@ retry:
 			if (ret < 0)
 				goto error;
 
-			set_page_private(req->pages[i], 1);
-			SetPagePrivate(req->pages[i]);
+			attach_page_private(req->pages[i], (void *)1);
 			unlock_page(req->pages[i]);
 			i++;
 		} else {
@@ -824,6 +823,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 				vp->cb_break_before = afs_calc_vnode_cb_break(vnode);
 				vp->vnode = vnode;
 				vp->put_vnode = true;
+				vp->speculative = true; /* vnode not locked */
 			}
 		}
 	}
@@ -1975,8 +1975,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags)
 	_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
 
-	set_page_private(page, 0);
-	ClearPagePrivate(page);
+	detach_page_private(page);
 
 	/* The directory will need reloading. */
 	if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
@@ -2003,8 +2002,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
 		afs_stat_v(dvnode, n_inval);
 
 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0 && length == PAGE_SIZE) {
-		set_page_private(page, 0);
-		ClearPagePrivate(page);
-	}
+	if (offset == 0 && length == PAGE_SIZE)
+		detach_page_private(page);
 }
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index b108528bf010..2ffe09abae7f 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -243,10 +243,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 						   index, gfp);
 			if (!page)
 				goto error;
-			if (!PagePrivate(page)) {
-				set_page_private(page, 1);
-				SetPagePrivate(page);
-			}
+			if (!PagePrivate(page))
+				attach_page_private(page, (void *)1);
 			dir_page = kmap(page);
 		}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 371d1488cc54..85f5adf21aa0 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -33,6 +33,7 @@ const struct file_operations afs_file_operations = {
 	.write_iter	= afs_file_write,
 	.mmap		= afs_file_mmap,
 	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
 	.fsync		= afs_fsync,
 	.lock		= afs_lock,
 	.flock		= afs_flock,
@@ -601,6 +602,63 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 }
 
 /*
+ * Adjust the dirty region of the page on truncation or full invalidation,
+ * getting rid of the markers altogether if the region is entirely invalidated.
+ */
+static void afs_invalidate_dirty(struct page *page, unsigned int offset,
+				 unsigned int length)
+{
+	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+	unsigned long priv;
+	unsigned int f, t, end = offset + length;
+
+	priv = page_private(page);
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0 && length == thp_size(page))
+		goto full_invalidate;
+
+	 /* If the page was dirtied by page_mkwrite(), the PTE stays writable
+	  * and we don't get another notification to tell us to expand it
+	  * again.
+	  */
+	if (afs_is_page_dirty_mmapped(priv))
+		return;
+
+	/* We may need to shorten the dirty region */
+	f = afs_page_dirty_from(priv);
+	t = afs_page_dirty_to(priv);
+
+	if (t <= offset || f >= end)
+		return; /* Doesn't overlap */
+
+	if (f < offset && t > end)
+		return; /* Splits the dirty region - just absorb it */
+
+	if (f >= offset && t <= end)
+		goto undirty;
+
+	if (f < offset)
+		t = offset;
+	else
+		f = end;
+	if (f == t)
+		goto undirty;
+
+	priv = afs_page_dirty(f, t);
+	set_page_private(page, priv);
+	trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv);
+	return;
+
+undirty:
+	trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv);
+	clear_page_dirty_for_io(page);
+full_invalidate:
+	priv = (unsigned long)detach_page_private(page);
+	trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv);
+}
+
+/*
  * invalidate part or all of a page
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
@@ -608,31 +666,23 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 static void afs_invalidatepage(struct page *page, unsigned int offset,
 			       unsigned int length)
 {
-	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	unsigned long priv;
-
 	_enter("{%lu},%u,%u", page->index, offset, length);
 
 	BUG_ON(!PageLocked(page));
 
+#ifdef CONFIG_AFS_FSCACHE
 	/* we clean up only if the entire page is being invalidated */
 	if (offset == 0 && length == PAGE_SIZE) {
-#ifdef CONFIG_AFS_FSCACHE
 		if (PageFsCache(page)) {
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
 			fscache_wait_on_page_write(vnode->cache, page);
 			fscache_uncache_page(vnode->cache, page);
 		}
+	}
 #endif
-		if (PagePrivate(page)) {
-			priv = page_private(page);
-			trace_afs_page_dirty(vnode, tracepoint_string("inval"),
-					     page->index, priv);
-			set_page_private(page, 0);
-			ClearPagePrivate(page);
-		}
-	}
+	if (PagePrivate(page))
+		afs_invalidate_dirty(page, offset, length);
 
 	_leave("");
 }
@@ -660,11 +710,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 #endif
 
 	if (PagePrivate(page)) {
-		priv = page_private(page);
+		priv = (unsigned long)detach_page_private(page);
 		trace_afs_page_dirty(vnode, tracepoint_string("rel"),
 				     page->index, priv);
-		set_page_private(page, 0);
-		ClearPagePrivate(page);
 	}
 
 	/* indicate that the page can be released */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0fe8844b4bee..b0d7b892090d 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -294,6 +294,13 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
 			op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
 		}
 	} else if (vp->scb.have_status) {
+		if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version &&
+		    vp->speculative)
+			/* Ignore the result of a speculative bulk status fetch
+			 * if it splits around a modification op, thereby
+			 * appearing to regress the data version.
+			 */
+			goto out;
 		afs_apply_status(op, vp);
 		if (vp->scb.have_cb)
 			afs_apply_callback(op, vp);
@@ -305,6 +312,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
 		}
 	}
 
+out:
 	write_sequnlock(&vnode->cb_lock);
 
 	if (vp->scb.have_status)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 81b0485fd22a..0d150a29e39e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -755,6 +755,7 @@ struct afs_vnode_param {
 	bool			update_ctime:1;	/* Need to update the ctime */
 	bool			set_size:1;	/* Must update i_size */
 	bool			op_unlinked:1;	/* True if file was unlinked by op */
+	bool			speculative:1;	/* T if speculative status fetch (no vnode lock) */
 };
 
 /*
@@ -812,6 +813,7 @@ struct afs_operation {
 			pgoff_t		last;		/* last page in mapping to deal with */
 			unsigned	first_offset;	/* offset into mapping[first] */
 			unsigned	last_to;	/* amount of mapping[last] */
+			bool		laundering;	/* Laundering page, PG_writeback not set */
 		} store;
 		struct {
 			struct iattr	*attr;
@@ -857,6 +859,62 @@ struct afs_vnode_cache_aux {
 	u64			data_version;
 } __packed;
 
+/*
+ * We use page->private to hold the amount of the page that we've written to,
+ * splitting the field into two parts.  However, we need to represent a range
+ * 0...PAGE_SIZE, so we reduce the resolution if the size of the page
+ * exceeds what we can encode.
+ */
+#ifdef CONFIG_64BIT
+#define __AFS_PAGE_PRIV_MASK	0x7fffffffUL
+#define __AFS_PAGE_PRIV_SHIFT	32
+#define __AFS_PAGE_PRIV_MMAPPED	0x80000000UL
+#else
+#define __AFS_PAGE_PRIV_MASK	0x7fffUL
+#define __AFS_PAGE_PRIV_SHIFT	16
+#define __AFS_PAGE_PRIV_MMAPPED	0x8000UL
+#endif
+
+static inline unsigned int afs_page_dirty_resolution(void)
+{
+	int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+	return (shift > 0) ? shift : 0;
+}
+
+static inline size_t afs_page_dirty_from(unsigned long priv)
+{
+	unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
+
+	/* The lower bound is inclusive */
+	return x << afs_page_dirty_resolution();
+}
+
+static inline size_t afs_page_dirty_to(unsigned long priv)
+{
+	unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
+
+	/* The upper bound is immediately beyond the region */
+	return (x + 1) << afs_page_dirty_resolution();
+}
+
+static inline unsigned long afs_page_dirty(size_t from, size_t to)
+{
+	unsigned int res = afs_page_dirty_resolution();
+	from >>= res;
+	to = (to - 1) >> res;
+	return (to << __AFS_PAGE_PRIV_SHIFT) | from;
+}
+
+static inline unsigned long afs_page_dirty_mmapped(unsigned long priv)
+{
+	return priv | __AFS_PAGE_PRIV_MMAPPED;
+}
+
+static inline bool afs_is_page_dirty_mmapped(unsigned long priv)
+{
+	return priv & __AFS_PAGE_PRIV_MMAPPED;
+}
+
 #include <trace/events/afs.h>
 
 /*****************************************************************************/
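A note on the page->private scheme the fs/afs/internal.h hunk above introduces: the dirty byte range is packed as an inclusive lower bound in the low half of the word and an upper bound stored minus one in the high half, with one spare bit marking pages dirtied through mmap; storing "to - 1" is what lets the full range 0...PAGE_SIZE fit in the available bits. The following standalone userspace sketch models that encoding (not kernel code: a 64-bit build and 4 KiB pages are assumed, and PAGE_SHIFT, main() and the asserts are scaffolding):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-ins for kernel build constants: 64-bit, 4 KiB pages assumed. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

#define __AFS_PAGE_PRIV_MASK	0x7fffffffUL
#define __AFS_PAGE_PRIV_SHIFT	32
#define __AFS_PAGE_PRIV_MMAPPED	0x80000000UL

/* If PAGE_SHIFT exceeds what the field can encode, drop resolution. */
static unsigned int afs_page_dirty_resolution(void)
{
	int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
	return (shift > 0) ? shift : 0;
}

static size_t afs_page_dirty_from(unsigned long priv)
{
	/* The lower bound is inclusive */
	return (priv & __AFS_PAGE_PRIV_MASK) << afs_page_dirty_resolution();
}

static size_t afs_page_dirty_to(unsigned long priv)
{
	unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;

	/* The upper bound is immediately beyond the region */
	return (x + 1) << afs_page_dirty_resolution();
}

static unsigned long afs_page_dirty(size_t from, size_t to)
{
	unsigned int res = afs_page_dirty_resolution();

	from >>= res;
	to = (to - 1) >> res;	/* stored minus one, see above */
	return (to << __AFS_PAGE_PRIV_SHIFT) | from;
}

int main(void)
{
	/* Encode a dirty region [100, 900) and read it back. */
	unsigned long priv = afs_page_dirty(100, 900);

	assert(afs_page_dirty_from(priv) == 100);
	assert(afs_page_dirty_to(priv) == 900);

	/* The whole page round-trips because PAGE_SIZE itself is never
	 * stored, only PAGE_SIZE - 1. */
	priv = afs_page_dirty(0, PAGE_SIZE);
	assert(afs_page_dirty_from(priv) == 0);
	assert(afs_page_dirty_to(priv) == PAGE_SIZE);

	/* The mmapped flag occupies the spare top bit of the low half and
	 * does not disturb either bound. */
	priv |= __AFS_PAGE_PRIV_MMAPPED;
	assert(afs_page_dirty_from(priv) == 0);

	printf("round-trips OK\n");
	return 0;
}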
diff --git a/fs/afs/write.c b/fs/afs/write.c
index da12abd6db21..c9195fc67fd8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -76,7 +76,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
  */
 int afs_write_begin(struct file *file, struct address_space *mapping,
 		    loff_t pos, unsigned len, unsigned flags,
-		    struct page **pagep, void **fsdata)
+		    struct page **_page, void **fsdata)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	struct page *page;
@@ -90,11 +90,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	_enter("{%llx:%llu},{%lx},%u,%u",
 	       vnode->fid.vid, vnode->fid.vnode, index, from, to);
 
-	/* We want to store information about how much of a page is altered in
-	 * page->private.
-	 */
-	BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8);
-
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
@@ -110,9 +105,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 		SetPageUptodate(page);
 	}
 
-	/* page won't leak in error case: it eventually gets cleaned off LRU */
-	*pagep = page;
-
 try_again:
 	/* See if this page is already partially written in a way that we can
 	 * merge the new write with.
@@ -120,8 +112,8 @@ try_again:
 	t = f = 0;
 	if (PagePrivate(page)) {
 		priv = page_private(page);
-		f = priv & AFS_PRIV_MAX;
-		t = priv >> AFS_PRIV_SHIFT;
+		f = afs_page_dirty_from(priv);
+		t = afs_page_dirty_to(priv);
 		ASSERTCMP(f, <=, t);
 	}
@@ -138,21 +130,9 @@ try_again:
 		if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) &&
 		    (to < f || from > t))
 			goto flush_conflicting_write;
-		if (from < f)
-			f = from;
-		if (to > t)
-			t = to;
-	} else {
-		f = from;
-		t = to;
 	}
 
-	priv = (unsigned long)t << AFS_PRIV_SHIFT;
-	priv |= f;
-	trace_afs_page_dirty(vnode, tracepoint_string("begin"),
-			     page->index, priv);
-	SetPagePrivate(page);
-	set_page_private(page, priv);
+	*_page = page;
 	_leave(" = 0");
 	return 0;
@@ -162,17 +142,18 @@ try_again:
 flush_conflicting_write:
 	_debug("flush conflict");
 	ret = write_one_page(page);
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ret;
-	}
+	if (ret < 0)
+		goto error;
 
 	ret = lock_page_killable(page);
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ret;
-	}
+	if (ret < 0)
+		goto error;
 	goto try_again;
+
+error:
+	put_page(page);
+	_leave(" = %d", ret);
+	return ret;
 }
 
 /*
@@ -184,12 +165,18 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 {
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	struct key *key = afs_file_key(file);
+	unsigned long priv;
+	unsigned int f, from = pos & (PAGE_SIZE - 1);
+	unsigned int t, to = from + copied;
 	loff_t i_size, maybe_i_size;
-	int ret;
+	int ret = 0;
 
 	_enter("{%llx:%llu},{%lx}",
 	       vnode->fid.vid, vnode->fid.vnode, page->index);
 
+	if (copied == 0)
+		goto out;
+
 	maybe_i_size = pos + copied;
 
 	i_size = i_size_read(&vnode->vfs_inode);
@@ -215,6 +202,25 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 		SetPageUptodate(page);
 	}
 
+	if (PagePrivate(page)) {
+		priv = page_private(page);
+		f = afs_page_dirty_from(priv);
+		t = afs_page_dirty_to(priv);
+		if (from < f)
+			f = from;
+		if (to > t)
+			t = to;
+		priv = afs_page_dirty(f, t);
+		set_page_private(page, priv);
+		trace_afs_page_dirty(vnode, tracepoint_string("dirty+"),
+				     page->index, priv);
+	} else {
+		priv = afs_page_dirty(from, to);
+		attach_page_private(page, (void *)priv);
+		trace_afs_page_dirty(vnode, tracepoint_string("dirty"),
+				     page->index, priv);
+	}
+
 	set_page_dirty(page);
 	if (PageDirty(page))
 		_debug("dirtied");
@@ -334,10 +340,9 @@ static void afs_pages_written_back(struct afs_vnode *vnode,
 		ASSERTCMP(pv.nr, ==, count);
 
 		for (loop = 0; loop < count; loop++) {
-			priv = page_private(pv.pages[loop]);
+			priv = (unsigned long)detach_page_private(pv.pages[loop]);
 			trace_afs_page_dirty(vnode, tracepoint_string("clear"),
 					     pv.pages[loop]->index, priv);
-			set_page_private(pv.pages[loop], 0);
 			end_page_writeback(pv.pages[loop]);
 		}
 		first += count;
@@ -396,7 +401,8 @@ static void afs_store_data_success(struct afs_operation *op)
 	op->ctime = op->file[0].scb.status.mtime_client;
 	afs_vnode_commit_status(op, &op->file[0]);
 	if (op->error == 0) {
-		afs_pages_written_back(vnode, op->store.first, op->store.last);
+		if (!op->store.laundering)
+			afs_pages_written_back(vnode, op->store.first, op->store.last);
 		afs_stat_v(vnode, n_stores);
 		atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) -
 				(op->store.first * PAGE_SIZE + op->store.first_offset),
@@ -415,7 +421,7 @@ static const struct afs_operation_ops afs_store_data_operation = {
  */
 static int afs_store_data(struct address_space *mapping,
 			  pgoff_t first, pgoff_t last,
-			  unsigned offset, unsigned to)
+			  unsigned offset, unsigned to, bool laundering)
 {
 	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
 	struct afs_operation *op;
@@ -448,6 +454,7 @@ static int afs_store_data(struct address_space *mapping,
 	op->store.last = last;
 	op->store.first_offset = offset;
 	op->store.last_to = to;
+	op->store.laundering = laundering;
 	op->mtime = vnode->vfs_inode.i_mtime;
 	op->flags |= AFS_OPERATION_UNINTR;
 	op->ops = &afs_store_data_operation;
@@ -509,8 +516,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
 	 */
 	start = primary_page->index;
 	priv = page_private(primary_page);
-	offset = priv & AFS_PRIV_MAX;
-	to = priv >> AFS_PRIV_SHIFT;
+	offset = afs_page_dirty_from(priv);
+	to = afs_page_dirty_to(priv);
 	trace_afs_page_dirty(vnode, tracepoint_string("store"),
 			     primary_page->index, priv);
@@ -555,8 +562,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
 			}
 
 			priv = page_private(page);
-			f = priv & AFS_PRIV_MAX;
-			t = priv >> AFS_PRIV_SHIFT;
+			f = afs_page_dirty_from(priv);
+			t = afs_page_dirty_to(priv);
 			if (f != 0 &&
 			    !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) {
 				unlock_page(page);
@@ -601,7 +608,7 @@ no_more:
 	if (end > i_size)
 		to = i_size & ~PAGE_MASK;
 
-	ret = afs_store_data(mapping, first, last, offset, to);
+	ret = afs_store_data(mapping, first, last, offset, to, false);
 	switch (ret) {
 	case 0:
 		ret = count;
@@ -857,12 +864,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
 	 */
 	wait_on_page_writeback(vmf->page);
 
-	priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */
-	priv |= 0; /* From */
+	priv = afs_page_dirty(0, PAGE_SIZE);
+	priv = afs_page_dirty_mmapped(priv);
 	trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"),
 			     vmf->page->index, priv);
-	SetPagePrivate(vmf->page);
-	set_page_private(vmf->page, priv);
+	if (PagePrivate(vmf->page))
+		set_page_private(vmf->page, priv);
+	else
+		attach_page_private(vmf->page, (void *)priv);
 	file_update_time(file);
 
 	sb_end_pagefault(inode->i_sb);
@@ -915,19 +924,18 @@ int afs_launder_page(struct page *page)
 		f = 0;
 		t = PAGE_SIZE;
 		if (PagePrivate(page)) {
-			f = priv & AFS_PRIV_MAX;
-			t = priv >> AFS_PRIV_SHIFT;
+			f = afs_page_dirty_from(priv);
+			t = afs_page_dirty_to(priv);
 		}
 
 		trace_afs_page_dirty(vnode, tracepoint_string("launder"),
 				     page->index, priv);
-		ret = afs_store_data(mapping, page->index, page->index, t, f);
+		ret = afs_store_data(mapping, page->index, page->index, t, f, true);
 	}
 
+	priv = (unsigned long)detach_page_private(page);
 	trace_afs_page_dirty(vnode, tracepoint_string("laundered"),
 			     page->index, priv);
-	set_page_private(page, 0);
-	ClearPagePrivate(page);
 
 #ifdef CONFIG_AFS_FSCACHE
 	if (PageFsCache(page)) {
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 84f3c4f57531..95c573dcda11 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -85,7 +85,7 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler,
 			if (acl->size <= size)
 				memcpy(buffer, acl->data, acl->size);
 			else
-				op->error = -ERANGE;
+				ret = -ERANGE;
 		}
 	}
@@ -148,11 +148,6 @@ static const struct xattr_handler afs_xattr_afs_acl_handler = {
 	.set    = afs_xattr_set_acl,
 };
 
-static void yfs_acl_put(struct afs_operation *op)
-{
-	yfs_free_opaque_acl(op->yacl);
-}
-
 static const struct afs_operation_ops yfs_fetch_opaque_acl_operation = {
 	.issue_yfs_rpc	= yfs_fs_fetch_opaque_acl,
 	.success	= afs_acl_success,
@@ -246,7 +241,7 @@ error:
 static const struct afs_operation_ops yfs_store_opaque_acl2_operation = {
 	.issue_yfs_rpc	= yfs_fs_store_opaque_acl2,
 	.success	= afs_acl_success,
-	.put		= yfs_acl_put,
+	.put		= afs_acl_put,
 };
 
 /*
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 3b1239b7e90d..bd787e71a657 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1990,6 +1990,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op)
 	memcpy(bp, acl->data, acl->size);
 	if (acl->size != size)
 		memset((void *)bp + acl->size, 0, size - acl->size);
+	bp += size / sizeof(__be32);
 	yfs_check_req(call, bp);
 
 	trace_afs_make_fs_call(call, &vp->fid);
diff --git a/fs/aio.c b/fs/aio.c
@@ -1572,7 +1572,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		 * we return to userspace.
 		 */
 		if (S_ISREG(file_inode(file)->i_mode)) {
-			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
+			sb_start_write(file_inode(file)->i_sb);
 			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 		}
 		req->ki_flags |= IOCB_WRITE;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b6b3d052ca86..fa50e8936f5f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1690,7 +1690,7 @@ struct elf_thread_core_info {
 	struct elf_thread_core_info *next;
 	struct task_struct *task;
 	struct elf_prstatus prstatus;
-	struct memelfnote notes[0];
+	struct memelfnote notes[];
 };
 
 struct elf_note_info {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b3268f4ea5f3..771a036867dc 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -544,7 +544,18 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	int level = ref->level;
 	struct btrfs_key search_key = ref->key_for_search;
 
-	root = btrfs_get_fs_root(fs_info, ref->root_id, false);
+	/*
+	 * If we're search_commit_root we could possibly be holding locks on
+	 * other tree nodes.  This happens when qgroups does backref walks when
+	 * adding new delayed refs.  To deal with this we need to look in cache
+	 * for the root, and if we don't find it then we need to search the
+	 * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
+	 * here.
+	 */
+	if (path->search_commit_root)
+		root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+	else
+		root = btrfs_get_fs_root(fs_info, ref->root_id, false);
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
 		goto out_free;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c0f1d6818df7..3ba6f3839d39 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2024,6 +2024,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 		key.offset = 0;
 		btrfs_release_path(path);
 	}
+	btrfs_release_path(path);
 
 	list_for_each_entry(space_info, &info->space_info, list) {
 		int i;
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 7e1549a84fcc..bc920afe23bf 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -511,7 +511,8 @@ again:
 				/*DEFAULT_RATELIMIT_BURST*/ 1);
 		if (__ratelimit(&_rs))
 			WARN(1, KERN_DEBUG
-				"BTRFS: block rsv returned %d\n", ret);
+				"BTRFS: block rsv %d returned %d\n",
+				block_rsv->type, ret);
 	}
 try_reserve:
 	ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac3d6f4e35b..0b29bdb25105 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -878,7 +878,10 @@ struct btrfs_fs_info {
 	 */
 	struct ulist *qgroup_ulist;
 
-	/* protect user change for quota operations */
+	/*
+	 * Protect user change for quota operations. If a transaction is needed,
+	 * it must be started before locking this lock.
+	 */
 	struct mutex qgroup_ioctl_lock;
 
 	/* list of dirty qgroups to be written at next commit */
@@ -3564,6 +3567,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 int btrfs_reada_wait(void *handle);
 void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct extent_buffer *eb, int err);
+void btrfs_reada_remove_dev(struct btrfs_device *dev);
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
 
 static inline int is_fstree(u64 rootid)
 {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 4a0243cb9d97..10638537b9ef 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -91,6 +91,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
 	if (ret) {
 no_valid_dev_replace_entry_found:
+		/*
+		 * We don't have a replace item or it's corrupted.  If there is
+		 * a replace target, fail the mount.
+		 */
+		if (btrfs_find_device(fs_info->fs_devices,
+				      BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
+			btrfs_err(fs_info,
+			"found replace target device without a valid replace item");
+			ret = -EUCLEAN;
+			goto out;
+		}
 		ret = 0;
 		dev_replace->replace_state =
 			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
@@ -143,8 +154,19 @@ no_valid_dev_replace_entry_found:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
-		dev_replace->srcdev = NULL;
-		dev_replace->tgtdev = NULL;
+		/*
+		 * We don't have an active replace item but if there is a
+		 * replace target, fail the mount.
+		 */
+		if (btrfs_find_device(fs_info->fs_devices,
+				      BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
+			btrfs_err(fs_info,
+			"replace devid present without an active replace item");
+			ret = -EUCLEAN;
+		} else {
+			dev_replace->srcdev = NULL;
+			dev_replace->tgtdev = NULL;
+		}
 		break;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -688,6 +710,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	}
 	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 
+	if (!scrub_ret)
+		btrfs_reada_remove_dev(src_device);
+
 	/*
 	 * We have to use this loop approach because at this point src_device
 	 * has to be available for transaction commit to complete, yet new
 	 * chunks shouldn't be allocated on the device.
 	 */
 	while (1) {
 		trans = btrfs_start_transaction(root, 0);
 		if (IS_ERR(trans)) {
+			btrfs_reada_undo_remove_dev(src_device);
 			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 			return PTR_ERR(trans);
 		}
@@ -746,6 +772,7 @@ error:
 		up_write(&dev_replace->rwsem);
 		mutex_unlock(&fs_info->chunk_mutex);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		btrfs_reada_undo_remove_dev(src_device);
 		btrfs_rm_dev_replace_blocked(fs_info);
 		if (tgt_device)
 			btrfs_destroy_dev_replace_tgtdev(tgt_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e3438672a82..af97ddcc6b3e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1281,32 +1281,26 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
-					struct btrfs_key *key)
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+					      struct btrfs_path *path,
+					      struct btrfs_key *key)
 {
 	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
-	struct btrfs_path *path;
 	u64 generation;
 	int ret;
 	int level;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return ERR_PTR(-ENOMEM);
-
 	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
-	if (!root) {
-		ret = -ENOMEM;
-		goto alloc_fail;
-	}
+	if (!root)
+		return ERR_PTR(-ENOMEM);
 
 	ret = btrfs_find_root(tree_root, key, path,
 			      &root->root_item, &root->root_key);
 	if (ret) {
 		if (ret > 0)
 			ret = -ENOENT;
-		goto find_fail;
+		goto fail;
 	}
 
 	generation = btrfs_root_generation(&root->root_item);
@@ -1317,21 +1311,31 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	if (IS_ERR(root->node)) {
 		ret = PTR_ERR(root->node);
 		root->node = NULL;
-		goto find_fail;
+		goto fail;
 	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
 		ret = -EIO;
-		goto find_fail;
+		goto fail;
 	}
 	root->commit_root = btrfs_root_node(root);
-out:
-	btrfs_free_path(path);
 	return root;
-
-find_fail:
+fail:
 	btrfs_put_root(root);
-alloc_fail:
-	root = ERR_PTR(ret);
-	goto out;
+	return ERR_PTR(ret);
+}
+
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					struct btrfs_key *key)
+{
+	struct btrfs_root *root;
+	struct btrfs_path *path;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+	root = read_tree_root_path(tree_root, path, key);
+	btrfs_free_path(path);
+
+	return root;
 }
 
 /*
@@ -1419,6 +1423,31 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
 	return root;
 }
 
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+						u64 objectid)
+{
+	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->tree_root);
+	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->extent_root);
+	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->chunk_root);
+	if (objectid == BTRFS_DEV_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->dev_root);
+	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->csum_root);
+	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->quota_root) ?
+			fs_info->quota_root : ERR_PTR(-ENOENT);
+	if (objectid == BTRFS_UUID_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->uuid_root) ?
+			fs_info->uuid_root : ERR_PTR(-ENOENT);
+	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->free_space_root) ?
+			fs_info->free_space_root : ERR_PTR(-ENOENT);
+	return NULL;
+}
+
 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
 			 struct btrfs_root *root)
 {
@@ -1518,25 +1547,9 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_key key;
 	int ret;
 
-	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->tree_root);
-	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->extent_root);
-	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->chunk_root);
-	if (objectid == BTRFS_DEV_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->dev_root);
-	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->csum_root);
-	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->quota_root) ?
-			fs_info->quota_root : ERR_PTR(-ENOENT);
-	if (objectid == BTRFS_UUID_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->uuid_root) ?
-			fs_info->uuid_root : ERR_PTR(-ENOENT);
-	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->free_space_root) ?
-			fs_info->free_space_root : ERR_PTR(-ENOENT);
+	root = btrfs_get_global_root(fs_info, objectid);
+	if (root)
+		return root;
 again:
 	root = btrfs_lookup_fs_root(fs_info, objectid);
 	if (root) {
@@ -1622,6 +1635,52 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
 }
 
 /*
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info:	the fs_info
+ * @objectid:	the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory.  If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there.  This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
+ */
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+						 struct btrfs_path *path,
+						 u64 objectid)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+
+	ASSERT(path->search_commit_root && path->skip_locking);
+
+	/*
+	 * This can return -ENOENT if we ask for a root that doesn't exist, but
+	 * since this is called via the backref walking code we won't be looking
+	 * up a root that doesn't exist, unless there's corruption.  So if root
+	 * != NULL just return it.
+	 */
+	root = btrfs_get_global_root(fs_info, objectid);
+	if (root)
+		return root;
+
+	root = btrfs_lookup_fs_root(fs_info, objectid);
+	if (root)
+		return root;
+
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = read_tree_root_path(fs_info->tree_root, path, &key);
+	btrfs_release_path(path);
+
+	return root;
+}
+
+/*
  * called by the kthread helper functions to finally call the bio end_io
  * functions.  This is where read checksum verification actually happens
  */
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index fee69ced58b4..182540bdcea0 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -69,6 +69,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
 				     u64 objectid, bool check_ref);
 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
 					 u64 objectid, dev_t anon_dev);
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+						 struct btrfs_path *path,
+						 u64 objectid);
 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b21fee13e77..5fd60b13f4f8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3185,7 +3185,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		struct btrfs_tree_block_info *bi;
 		if (item_size < sizeof(*ei) + sizeof(*bi)) {
 			btrfs_crit(info,
-"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
 				   key.objectid, key.type, key.offset,
 				   owner_objectid, item_size,
 				   sizeof(*ei) + sizeof(*bi));
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0ff659455b1e..4373da7bcc0d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -452,46 +452,6 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
-					 const u64 start,
-					 const u64 len,
-					 struct extent_state **cached_state)
-{
-	u64 search_start = start;
-	const u64 end = start + len - 1;
-
-	while (search_start < end) {
-		const u64 search_len = end - search_start + 1;
-		struct extent_map *em;
-		u64 em_len;
-		int ret = 0;
-
-		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
-		if (IS_ERR(em))
-			return PTR_ERR(em);
-
-		if (em->block_start != EXTENT_MAP_HOLE)
-			goto next;
-
-		em_len = em->len;
-		if (em->start < search_start)
-			em_len -= search_start - em->start;
-		if (em_len > search_len)
-			em_len = search_len;
-
-		ret = set_extent_bit(&inode->io_tree, search_start,
-				     search_start + em_len - 1,
-				     EXTENT_DELALLOC_NEW,
-				     NULL, cached_state, GFP_NOFS);
-next:
-		search_start = extent_map_end(em);
-		free_extent_map(em);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
 /*
  * after copy_from_user, pages need to be dirtied and we need to make
  * sure holes are created between the current EOF and the start of
@@ -528,23 +488,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			 0, 0, cached);
 
-	if (!btrfs_is_free_space_inode(inode)) {
-		if (start_pos >= isize &&
-		    !(inode->flags & BTRFS_INODE_PREALLOC)) {
-			/*
-			 * There can't be any extents following eof in this case
-			 * so just set the delalloc new bit for the range
-			 * directly.
-			 */
-			extra_bits |= EXTENT_DELALLOC_NEW;
-		} else {
-			err = btrfs_find_new_delalloc_bytes(inode, start_pos,
-							    num_bytes, cached);
-			if (err)
-				return err;
-		}
-	}
-
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 					extra_bits, cached);
 	if (err)
@@ -3628,7 +3571,8 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		inode_lock_shared(inode);
 		ret = btrfs_direct_IO(iocb, to);
 		inode_unlock_shared(inode);
-		if (ret < 0)
+		if (ret < 0 || !iov_iter_count(to) ||
+		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
 			return ret;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 936c3137c646..7e8d8169779d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2253,11 +2253,69 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+					 const u64 start,
+					 const u64 len,
+					 struct extent_state **cached_state)
+{
+	u64 search_start = start;
+	const u64 end = start + len - 1;
+
+	while (search_start < end) {
+		const u64 search_len = end - search_start + 1;
+		struct extent_map *em;
+		u64 em_len;
+		int ret = 0;
+
+		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+		if (IS_ERR(em))
+			return PTR_ERR(em);
+
+		if (em->block_start != EXTENT_MAP_HOLE)
+			goto next;
+
+		em_len = em->len;
+		if (em->start < search_start)
+			em_len -= search_start - em->start;
+		if (em_len > search_len)
+			em_len = search_len;
+
+		ret = set_extent_bit(&inode->io_tree, search_start,
+				     search_start + em_len - 1,
+				     EXTENT_DELALLOC_NEW,
+				     NULL, cached_state, GFP_NOFS);
+next:
+		search_start = extent_map_end(em);
+		free_extent_map(em);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
 			      struct extent_state **cached_state)
 {
 	WARN_ON(PAGE_ALIGNED(end));
+
+	if (start >= i_size_read(&inode->vfs_inode) &&
+	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
+		/*
+		 * There can't be any extents following eof in this case so just
+		 * set the delalloc new bit for the range directly.
+		 */
+		extra_bits |= EXTENT_DELALLOC_NEW;
+	} else {
+		int ret;
+
+		ret = btrfs_find_new_delalloc_bytes(inode, start,
+						    end + 1 - start,
+						    cached_state);
+		if (ret)
+			return ret;
+	}
+
 	return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
 				   cached_state);
 }
@@ -9672,10 +9730,16 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		 * clear_offset by our extent size.
 		 */
 		clear_offset += ins.offset;
-		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 
 		last_alloc = ins.offset;
 		trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+		/*
+		 * Now that we inserted the prealloc extent we can finally
+		 * decrement the number of reservations in the block group.
+		 * If we did it before, we could race with relocation and have
+		 * relocation miss the reserved extent, making it fail later.
+		 */
+		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
 			btrfs_free_reserved_extent(fs_info, ins.objectid,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab408a23ba32..69a384145dc6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1274,6 +1274,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
 	u64 page_start;
 	u64 page_end;
 	u64 page_cnt;
+	u64 start = (u64)start_index << PAGE_SHIFT;
 	int ret;
 	int i;
 	int i_done;
@@ -1290,8 +1291,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
 	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
 	ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
-			start_index << PAGE_SHIFT,
-			page_cnt << PAGE_SHIFT);
+			start, page_cnt << PAGE_SHIFT);
 	if (ret)
 		return ret;
 	i_done = 0;
@@ -1380,8 +1380,7 @@ again:
 		btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
 		spin_unlock(&BTRFS_I(inode)->lock);
 		btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-				start_index << PAGE_SHIFT,
-				(page_cnt - i_done) << PAGE_SHIFT, true);
+				start, (page_cnt - i_done) << PAGE_SHIFT, true);
 	}
@@ -1408,8 +1407,7 @@ out:
 		put_page(pages[i]);
 	}
 	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-			start_index << PAGE_SHIFT,
-			page_cnt << PAGE_SHIFT, true);
+			start, page_cnt << PAGE_SHIFT, true);
 	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
 	extent_changeset_free(data_reserved);
 	return ret;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 580899bdb991..87bd37b70738 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/btrfs.h>
+#include <linux/sched/mm.h>
 
 #include "ctree.h"
 #include "transaction.h"
@@ -497,13 +498,13 @@ next2:
 			break;
 	}
 out:
+	btrfs_free_path(path);
 	fs_info->qgroup_flags |= flags;
 	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
 		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
 		 ret >= 0)
 		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-	btrfs_free_path(path);
 
 	if (ret < 0) {
 		ulist_free(fs_info->qgroup_ulist);
@@ -936,6 +937,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 	struct btrfs_key found_key;
 	struct btrfs_qgroup *qgroup = NULL;
 	struct btrfs_trans_handle *trans = NULL;
+	struct ulist *ulist = NULL;
 	int ret = 0;
 	int slot;
 
@@ -943,8 +945,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 	if (fs_info->quota_root)
 		goto out;
 
-	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
-	if (!fs_info->qgroup_ulist) {
+	ulist = ulist_alloc(GFP_KERNEL);
+	if (!ulist) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -952,6 +954,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 	ret = btrfs_sysfs_add_qgroups(fs_info);
 	if (ret < 0)
 		goto out;
+
+	/*
+	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+	 * avoid lock acquisition inversion problems (reported by lockdep) between
+	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+	 * start a transaction.
+	 * After we started the transaction lock qgroup_ioctl_lock again and
+	 * check if someone else created the quota root in the meanwhile. If so,
+	 * just return success and release the transaction handle.
+	 *
+	 * Also we don't need to worry about someone else calling
+	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+	 * that function returns 0 (success) when the sysfs entries already exist.
+	 */
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
 	/*
 	 * 1 for quota root item
 	 * 1 for BTRFS_QGROUP_STATUS item
@@ -961,12 +979,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 	 * would be a lot of overkill.
 	 */
 	trans = btrfs_start_transaction(tree_root, 2);
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		trans = NULL;
 		goto out;
 	}
 
+	if (fs_info->quota_root)
+		goto out;
+
+	fs_info->qgroup_ulist = ulist;
+	ulist = NULL;
+
 	/*
 	 * initially create the quota tree
 	 */
@@ -1026,6 +1052,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+			/* Release locks on tree_root before we access quota_root */
+			btrfs_release_path(path);
+
 			ret = add_qgroup_item(trans, quota_root,
 					      found_key.offset);
 			if (ret) {
@@ -1044,6 +1074,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 				btrfs_abort_transaction(trans, ret);
 				goto out_free_path;
 			}
+			ret = btrfs_search_slot_for_read(tree_root, &found_key,
+							 path, 1, 0);
+			if (ret < 0) {
+				btrfs_abort_transaction(trans, ret);
+				goto out_free_path;
+			}
+			if (ret > 0) {
+				/*
+				 * Shouldn't happen, but in case it does we
+				 * don't need to do the btrfs_next_item, just
+				 * continue.
+				 */
+				continue;
+			}
 		}
 		ret = btrfs_next_item(tree_root, path);
 		if (ret < 0) {
@@ -1106,11 +1150,14 @@ out:
 	if (ret) {
 		ulist_free(fs_info->qgroup_ulist);
 		fs_info->qgroup_ulist = NULL;
-		if (trans)
-			btrfs_end_transaction(trans);
 		btrfs_sysfs_del_qgroups(fs_info);
 	}
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	if (ret && trans)
+		btrfs_end_transaction(trans);
+	else if (trans)
+		ret = btrfs_end_transaction(trans);
+	ulist_free(ulist);
 	return ret;
 }
 
@@ -1123,19 +1170,29 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root)
 		goto out;
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
 	/*
 	 * 1 For the root item
 	 *
 	 * We should also reserve enough items for the quota tree deletion in
 	 * btrfs_clean_quota_tree but this is not done.
+	 *
+	 * Also, we must always start a transaction without holding the mutex
+	 * qgroup_ioctl_lock, see btrfs_quota_enable().
 	 */
 	trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
+		trans = NULL;
 		goto out;
 	}
 
+	if (!fs_info->quota_root)
+		goto out;
+
 	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 	btrfs_qgroup_wait_for_completion(fs_info, false);
 	spin_lock(&fs_info->qgroup_lock);
@@ -1149,13 +1206,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	ret = btrfs_clean_quota_tree(trans, quota_root);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
-		goto end_trans;
+		goto out;
 	}
 
 	ret = btrfs_del_root(trans, &quota_root->root_key);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
-		goto end_trans;
+		goto out;
 	}
 
 	list_del(&quota_root->dirty_list);
@@ -1167,10 +1224,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(quota_root);
 
-end_trans:
-	ret = btrfs_end_transaction(trans);
 out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	if (ret && trans)
+		btrfs_end_transaction(trans);
+	else if (trans)
+		ret = btrfs_end_transaction(trans);
+
 	return ret;
 }
 
@@ -1306,13 +1366,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup_list *list;
 	struct ulist *tmp;
+	unsigned int nofs_flag;
 	int ret = 0;
 
 	/* Check the level of src and dst first */
 	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
 		return -EINVAL;
 
+	/* We hold a transaction handle open, must do a NOFS allocation. */
+	nofs_flag = memalloc_nofs_save();
 	tmp = ulist_alloc(GFP_KERNEL);
+	memalloc_nofs_restore(nofs_flag);
 	if (!tmp)
 		return -ENOMEM;
@@ -1369,10 +1433,14 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 	struct btrfs_qgroup_list *list;
 	struct ulist *tmp;
 	bool found = false;
+	unsigned int nofs_flag;
 	int ret = 0;
 	int ret2;
 
+	/* We hold a transaction handle open, must do a NOFS allocation. */
+	nofs_flag = memalloc_nofs_save();
 	tmp = ulist_alloc(GFP_KERNEL);
+	memalloc_nofs_restore(nofs_flag);
 	if (!tmp)
 		return -ENOMEM;
@@ -3417,24 +3485,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
 {
 	struct rb_node *node;
 	struct rb_node *next;
-	struct ulist_node *entry = NULL;
+	struct ulist_node *entry;
 	int ret = 0;
 
 	node = reserved->range_changed.root.rb_node;
+	if (!node)
+		return 0;
 	while (node) {
 		entry = rb_entry(node, struct ulist_node, rb_node);
 		if (entry->val < start)
 			node = node->rb_right;
-		else if (entry)
-			node = node->rb_left;
 		else
-			break;
+			node = node->rb_left;
 	}
 
-	/* Empty changeset */
-	if (!entry)
-		return 0;
-
 	if (entry->val > start && rb_prev(&entry->rb_node))
 		entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
 				 rb_node);
@@ -3498,6 +3562,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle *trans;
 	int ret;
+	bool can_commit = true;
 
 	/*
 	 * We don't want to run flush again and again, so if there is a running
@@ -3509,6 +3574,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
 		return 0;
 	}
 
+	/*
+	 * If current process holds a transaction, we shouldn't flush, as we
+	 * assume all space reservation happens before a transaction handle is
+	 * held.
+	 *
+	 * But there are cases like btrfs_delayed_item_reserve_metadata() where
+	 * we try to reserve space with one transaction handle already held.
+	 * In that case we can't commit transaction, but at least try to end it
+	 * and hope the started data writes can free some space.
+	 */
+	if (current->journal_info &&
+	    current->journal_info != BTRFS_SEND_TRANS_STUB)
+		can_commit = false;
+
 	ret = btrfs_start_delalloc_snapshot(root);
 	if (ret < 0)
 		goto out;
@@ -3520,7 +3599,10 @@ static int try_flush_qgroup(struct btrfs_root *root)
 		goto out;
 	}
 
-	ret = btrfs_commit_transaction(trans);
+	if (can_commit)
+		ret = btrfs_commit_transaction(trans);
+	else
+		ret = btrfs_end_transaction(trans);
 out:
 	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
 	wake_up(&root->qgroup_flush_wait);
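The qgroup changes above all follow one locking rule, now documented in ctree.h: if a transaction is needed, start it before taking qgroup_ioctl_lock. btrfs_quota_enable() and btrfs_quota_disable() achieve this by dropping the mutex, starting the transaction, retaking the mutex, and re-checking quota_root in case another caller won the race. Below is a minimal userspace analogue of that drop/start/retake/recheck shape, with pthread mutexes standing in for the kernel primitives (all names are illustrative, not btrfs APIs):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER; /* qgroup_ioctl_lock analogue */
static pthread_mutex_t txn_lock = PTHREAD_MUTEX_INITIALIZER; /* "transaction" analogue */
static int quota_enabled;

static void txn_begin(void) { pthread_mutex_lock(&txn_lock); }
static void txn_end(void)   { pthread_mutex_unlock(&txn_lock); }

static int enable_quota(void)
{
	pthread_mutex_lock(&cfg_lock);
	if (quota_enabled) {
		pthread_mutex_unlock(&cfg_lock);
		return 0;		/* already on, nothing to do */
	}

	/* Drop cfg_lock before txn_begin() so the lock order is always
	 * txn_lock -> cfg_lock, never the reverse. */
	pthread_mutex_unlock(&cfg_lock);
	txn_begin();
	pthread_mutex_lock(&cfg_lock);

	if (quota_enabled) {		/* somebody else won the race */
		pthread_mutex_unlock(&cfg_lock);
		txn_end();
		return 0;
	}

	quota_enabled = 1;		/* the real work goes here */
	pthread_mutex_unlock(&cfg_lock);
	txn_end();
	return 0;
}

int main(void)
{
	enable_quota();
	printf("quota_enabled=%d\n", quota_enabled);
	return 0;
}

The re-check after reacquiring the mutex is what makes the dropped lock safe: whatever state was observed before the drop may be stale by the time the transaction is running.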
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 9d4f5316a7e8..d9a166eb344e 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -421,6 +421,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 		if (!dev->bdev)
 			continue;
 
+		if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
+			continue;
+
 		if (dev_replace_is_ongoing &&
 		    dev == fs_info->dev_replace.tgtdev) {
 			/*
@@ -445,6 +448,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 		}
 		have_zone = 1;
 	}
+	if (!have_zone)
+		radix_tree_delete(&fs_info->reada_tree, index);
 	spin_unlock(&fs_info->reada_lock);
 	up_read(&fs_info->dev_replace.rwsem);
@@ -1020,3 +1025,45 @@ void btrfs_reada_detach(void *handle)
 
 	kref_put(&rc->refcnt, reada_control_release);
 }
+
+/*
+ * Before removing a device (device replace or device remove ioctls), call this
+ * function to wait for all existing readahead requests on the device and to
+ * make sure no one queues more readahead requests for the device.
+ *
+ * Must be called while holding neither the device list mutex nor the device
+ * replace semaphore, otherwise it will deadlock.
+ */
+void btrfs_reada_remove_dev(struct btrfs_device *dev)
+{
+	struct btrfs_fs_info *fs_info = dev->fs_info;
+
+	/* Serialize with readahead extent creation at reada_find_extent(). */
+	spin_lock(&fs_info->reada_lock);
+	set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+	spin_unlock(&fs_info->reada_lock);
+
+	/*
+	 * There might be readahead requests added to the radix trees which
+	 * were not yet added to the readahead work queue. We need to start
+	 * them and wait for their completion, otherwise we can end up with
+	 * use-after-free problems when dropping the last reference on the
+	 * readahead extents and their zones, as they need to access the
+	 * device structure.
+	 */
+	reada_start_machine(fs_info);
+	btrfs_flush_workqueue(fs_info->readahead_workers);
+}
+
+/*
+ * If when removing a device (device replace or device remove ioctls) an error
+ * happens after calling btrfs_reada_remove_dev(), call this to undo what that
+ * function did. This is safe to call even if btrfs_reada_remove_dev() was not
+ * called before.
+ */
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
+{
+	spin_lock(&dev->fs_info->reada_lock);
+	clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+	spin_unlock(&dev->fs_info->reada_lock);
+}
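btrfs_reada_remove_dev() above is a two-step quiesce: first flip a flag under the same lock that reada_find_extent() checks, so no new readahead can target the device, then flush the work that was already queued. A rough userspace model of the same shape (illustrative names only; a plain mutex and a counter stand in for the radix trees and workqueue):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct device {
	pthread_mutex_t lock;
	bool no_reada;		/* BTRFS_DEV_STATE_NO_READA analogue */
	int queued;		/* outstanding readahead requests */
};

static bool queue_reada(struct device *dev)
{
	bool ok;

	pthread_mutex_lock(&dev->lock);
	ok = !dev->no_reada;	/* producers check the flag under the lock */
	if (ok)
		dev->queued++;
	pthread_mutex_unlock(&dev->lock);
	return ok;
}

static void drain_reada(struct device *dev)
{
	pthread_mutex_lock(&dev->lock);
	while (dev->queued > 0)
		dev->queued--;	/* stand-in for flushing the workqueue */
	pthread_mutex_unlock(&dev->lock);
}

static void reada_remove_dev(struct device *dev)
{
	/* Step 1: stop new requests. */
	pthread_mutex_lock(&dev->lock);
	dev->no_reada = true;
	pthread_mutex_unlock(&dev->lock);

	/* Step 2: wait out the ones already queued. */
	drain_reada(dev);
}

static void reada_undo_remove_dev(struct device *dev)
{
	pthread_mutex_lock(&dev->lock);
	dev->no_reada = false;	/* safe even if remove was never called */
	pthread_mutex_unlock(&dev->lock);
}

int main(void)
{
	struct device dev = { PTHREAD_MUTEX_INITIALIZER, false, 0 };

	queue_reada(&dev);
	reada_remove_dev(&dev);
	printf("queued after remove: %d, new request accepted: %d\n",
	       dev.queued, queue_reada(&dev));
	reada_undo_remove_dev(&dev);
	return 0;
}

Setting the flag and checking it under the same lock is what closes the window: a producer either sees the flag and backs off, or queues before the flag is set and is caught by the drain.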
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 7f03dbe5b609..78693d3dd15b 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -860,6 +860,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
 "dropping a ref for a root that doesn't have a ref on the block");
 			dump_block_entry(fs_info, be);
 			dump_ref_action(fs_info, ra);
+			kfree(ref);
 			kfree(ra);
 			goto out_unlock;
 		}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3602806d71bd..9ba92d86da0b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1648,6 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	struct btrfs_root_item *root_item;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
+	int reserve_level;
 	int level;
 	int max_level;
 	int replaced = 0;
@@ -1696,7 +1697,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	 * Thus the needed metadata size is at most root_level * nodesize,
 	 * and * 2 since we have two trees to COW.
 	 */
-	min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
+	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
+	min_reserved = fs_info->nodesize * reserve_level * 2;
 	memset(&next_key, 0, sizeof(next_key));
 
 	while (1) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index cf63f1e27a27..e71e7586e9eb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3866,8 +3866,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	if (!is_dev_replace && !readonly &&
 	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
-				rcu_str_deref(dev->name));
+		btrfs_err_in_rcu(fs_info,
+			"scrub on devid %llu: filesystem on %s is not writable",
+				 devid, rcu_str_deref(dev->name));
 		ret = -EROFS;
 		goto out;
 	}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e6719f7db386..04022069761d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -983,7 +983,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
 			       BTRFS_MAX_EXTENT_SIZE >> 1,
 			       (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
-			       EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+			       EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_err("clear_extent_bit returned %d", ret);
 		goto out;
@@ -1050,7 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
 			       BTRFS_MAX_EXTENT_SIZE + sectorsize,
 			       BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
-			       EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+			       EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_err("clear_extent_bit returned %d", ret);
 		goto out;
@@ -1082,7 +1084,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 
 	/* Empty */
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
-			       EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+			       EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_err("clear_extent_bit returned %d", ret);
 		goto out;
@@ -1097,7 +1100,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 out:
 	if (ret)
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
-				 EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+				 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+				 EXTENT_UPTODATE, 0, 0, NULL);
 	iput(inode);
 	btrfs_free_dummy_root(root);
 	btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index f0ffd5ee77bd..ea2bb4cb5890 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -760,18 +760,36 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
 	u64 type;
 	u64 features;
 	bool mixed = false;
+	int raid_index;
+	int nparity;
+	int ncopies;
 
 	length = btrfs_chunk_length(leaf, chunk);
 	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
 	type = btrfs_chunk_type(leaf, chunk);
+	raid_index = btrfs_bg_flags_to_raid_index(type);
+	ncopies = btrfs_raid_array[raid_index].ncopies;
+	nparity = btrfs_raid_array[raid_index].nparity;
 
 	if (!num_stripes) {
 		chunk_err(leaf, chunk, logical,
 			  "invalid chunk num_stripes, have %u", num_stripes);
 		return -EUCLEAN;
 	}
+	if (num_stripes < ncopies) {
+		chunk_err(leaf, chunk, logical,
+			  "invalid chunk num_stripes < ncopies, have %u < %d",
+			  num_stripes, ncopies);
+		return -EUCLEAN;
+	}
+	if (nparity && num_stripes == nparity) {
+		chunk_err(leaf, chunk, logical,
+			  "invalid chunk num_stripes == nparity, have %u == %d",
+			  num_stripes, nparity);
+		return -EUCLEAN;
+	}
 	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
 		chunk_err(leaf, chunk, logical,
 		"invalid chunk logical, have %llu should aligned to %u",
@@ -1050,6 +1068,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
 			    "invalid root item size, have %u expect %zu or %u",
 			    btrfs_item_size_nr(leaf, slot), sizeof(ri),
 			    btrfs_legacy_root_item_size());
+		return -EUCLEAN;
 	}
 
 	/*
@@ -1405,6 +1424,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
 	"invalid item size, have %u expect aligned to %zu for key type %u",
 			    btrfs_item_size_nr(leaf, slot),
 			    sizeof(*dref), key->type);
+		return -EUCLEAN;
 	}
 	if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
 		generic_err(leaf, slot,
@@ -1433,6 +1453,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
 			extent_err(leaf, slot,
 	"invalid extent data backref offset, have %llu expect aligned to %u",
 				   offset, leaf->fs_info->sectorsize);
+			return -EUCLEAN;
 		}
 	}
 	return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 58b9c419a2b6..78637665166e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -431,7 +431,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
 	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
-	btrfs_device_data_ordered_init(dev);
+	btrfs_device_data_ordered_init(dev, fs_info);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	extent_io_tree_init(fs_info, &dev->alloc_state,
@@ -940,7 +940,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 			if (device->bdev != path_bdev) {
 				bdput(path_bdev);
 				mutex_unlock(&fs_devices->device_list_mutex);
-				btrfs_warn_in_rcu(device->fs_info,
+				/*
+				 * device->fs_info may not be reliable here, so
+				 * pass in a NULL instead. This avoids a
+				 * possible use-after-free when the fs_info and
+				 * fs_info->sb are already torn down.
+				 */
+				btrfs_warn_in_rcu(NULL,
 	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
 						  path, devid, found_transid,
 						  current->comm,
@@ -1056,22 +1062,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 			continue;
 		}
 
-		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
-			/*
-			 * In the first step, keep the device which has
-			 * the correct fsid and the devid that is used
-			 * for the dev_replace procedure.
-			 * In the second step, the dev_replace state is
-			 * read from the device tree and it is known
-			 * whether the procedure is really active or
-			 * not, which means whether this device is
-			 * used or whether it should be removed.
-			 */
-			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
-						  &device->dev_state)) {
-				continue;
-			}
-		}
+		/*
+		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
+		 * in btrfs_init_dev_replace() so just continue.
+		 */
+		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+			continue;
+
 		if (device->bdev) {
 			blkdev_put(device->bdev, device->mode);
 			device->bdev = NULL;
@@ -1080,9 +1077,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 			list_del_init(&device->dev_alloc_list);
 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
-			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
-				      &device->dev_state))
-				fs_devices->rw_devices--;
 		}
 		list_del_init(&device->dev_list);
 		fs_devices->num_devices--;
@@ -2099,6 +2093,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	mutex_unlock(&uuid_mutex);
 	ret = btrfs_shrink_device(device, 0);
+	if (!ret)
+		btrfs_reada_remove_dev(device);
 	mutex_lock(&uuid_mutex);
 	if (ret)
 		goto error_undo;
@@ -2179,6 +2175,7 @@ out:
 	return ret;
 
 error_undo:
+	btrfs_reada_undo_remove_dev(device);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bf27ac07d315..232f02bd214f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -39,10 +39,10 @@ struct btrfs_io_geometry {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 #include <linux/seqlock.h>
 #define __BTRFS_NEED_DEVICE_DATA_ORDERED
-#define btrfs_device_data_ordered_init(device)	\
-	seqcount_init(&device->data_seqcount)
+#define btrfs_device_data_ordered_init(device, info)				\
+	seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex)
 #else
-#define btrfs_device_data_ordered_init(device) do { } while (0)
+#define btrfs_device_data_ordered_init(device, info) do { } while (0)
 #endif
 
 #define BTRFS_DEV_STATE_WRITEABLE	(0)
@@ -50,6 +50,7 @@ struct btrfs_io_geometry {
 #define BTRFS_DEV_STATE_MISSING		(2)
 #define BTRFS_DEV_STATE_REPLACE_TGT	(3)
 #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
+#define BTRFS_DEV_STATE_NO_READA	(5)
 
 struct btrfs_device {
 	struct list_head dev_list; /* device_list_mutex */
@@ -71,7 +72,8 @@ struct btrfs_device {
 	blk_status_t last_flush_error;
 
 #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
-	seqcount_t data_seqcount;
+	/* A seqcount_t with associated chunk_mutex (for lockdep) */
associated chunk_mutex (for lockdep) */ +	seqcount_mutex_t data_seqcount;  #endif  	/* the internal btrfs device id */ @@ -162,11 +164,9 @@ btrfs_device_get_##name(const struct btrfs_device *dev)			\  static inline void							\  btrfs_device_set_##name(struct btrfs_device *dev, u64 size)		\  {									\ -	preempt_disable();						\  	write_seqcount_begin(&dev->data_seqcount);			\  	dev->name = size;						\  	write_seqcount_end(&dev->data_seqcount);			\ -	preempt_enable();						\  }  #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)  #define BTRFS_DEVICE_GETSET_FUNCS(name)					\ diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3080cda9e824..8bda092e60c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -121,7 +121,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,  		_debug("reissue read");  		ret = bmapping->a_ops->readpage(NULL, backpage);  		if (ret < 0) -			goto unlock_discard; +			goto discard;  	}  	/* but the page may have been read before the monitor was installed, so @@ -138,6 +138,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,  unlock_discard:  	unlock_page(backpage); +discard:  	spin_lock_irq(&object->work_lock);  	list_del(&monitor->op_link);  	spin_unlock_irq(&object->work_lock); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5027bbdca419..ded4229c314a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4074,7 +4074,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,  	     vino.snap, inode);  	mutex_lock(&session->s_mutex); -	session->s_seq++; +	inc_session_sequence(session);  	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,  	     (unsigned)seq); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 08f1c0c31dc2..8f1d7500a7ec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4231,7 +4231,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,  	     dname.len, dname.name);  	mutex_lock(&session->s_mutex); -	session->s_seq++; +	inc_session_sequence(session);  	if (!inode) {  		dout("handle_lease no inode %llx\n", vino.ino); @@ -4385,29 +4385,49 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)  bool check_session_state(struct ceph_mds_session *s)  { -	if (s->s_state == CEPH_MDS_SESSION_CLOSING) { -		dout("resending session close request for mds%d\n", -				s->s_mds); -		request_close_session(s); -		return false; -	} -	if (s->s_ttl && time_after(jiffies, s->s_ttl)) { -		if (s->s_state == CEPH_MDS_SESSION_OPEN) { +	switch (s->s_state) { +	case CEPH_MDS_SESSION_OPEN: +		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {  			s->s_state = CEPH_MDS_SESSION_HUNG;  			pr_info("mds%d hung\n", s->s_mds);  		} -	} -	if (s->s_state == CEPH_MDS_SESSION_NEW || -	    s->s_state == CEPH_MDS_SESSION_RESTARTING || -	    s->s_state == CEPH_MDS_SESSION_CLOSED || -	    s->s_state == CEPH_MDS_SESSION_REJECTED) -		/* this mds is failed or recovering, just wait */ +		break; +	case CEPH_MDS_SESSION_CLOSING: +		/* Should never reach this when we're unmounting */ +		WARN_ON_ONCE(true); +		fallthrough; +	case CEPH_MDS_SESSION_NEW: +	case CEPH_MDS_SESSION_RESTARTING: +	case CEPH_MDS_SESSION_CLOSED: +	case CEPH_MDS_SESSION_REJECTED:  		return false; +	}  	return true;  }  /* + * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, + * then we need to retransmit that request. 
+ */ +void inc_session_sequence(struct ceph_mds_session *s) +{ +	lockdep_assert_held(&s->s_mutex); + +	s->s_seq++; + +	if (s->s_state == CEPH_MDS_SESSION_CLOSING) { +		int ret; + +		dout("resending session close request for mds%d\n", s->s_mds); +		ret = request_close_session(s); +		if (ret < 0) +			pr_err("unable to close session to mds%d: %d\n", +			       s->s_mds, ret); +	} +} + +/*   * delayed work -- periodically trim expired leases, renew caps with mds   */  static void schedule_delayed(struct ceph_mds_client *mdsc) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cbf8af437140..f5adbebcb38e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -480,6 +480,7 @@ struct ceph_mds_client {  extern const char *ceph_mds_op_name(int op);  extern bool check_session_state(struct ceph_mds_session *s); +void inc_session_sequence(struct ceph_mds_session *s);  extern struct ceph_mds_session *  __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 83cb4f26b689..9b785f11e95a 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -53,7 +53,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,  	/* increment msg sequence number */  	mutex_lock(&session->s_mutex); -	session->s_seq++; +	inc_session_sequence(session);  	mutex_unlock(&session->s_mutex);  	/* lookup inode */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0da39c16dab4..b611f829cb61 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -873,7 +873,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,  	     ceph_snap_op_name(op), split, trace_len);  	mutex_lock(&session->s_mutex); -	session->s_seq++; +	inc_session_sequence(session);  	mutex_unlock(&session->s_mutex);  	down_write(&mdsc->snap_rwsem); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 23b21e943652..ef4784e72b1d 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1266,6 +1266,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,  		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);  	} else if (mode_from_special_sid) {  		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, true); +		kfree(pntsd);  	} else {  		/* get approximated mode from ACL */  		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, false); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 504766cb6c19..dab94f67c988 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -264,7 +264,7 @@ smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)  }  static struct mid_q_entry * -smb2_find_mid(struct TCP_Server_Info *server, char *buf) +__smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)  {  	struct mid_q_entry *mid;  	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; @@ -281,6 +281,10 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)  		    (mid->mid_state == MID_REQUEST_SUBMITTED) &&  		    (mid->command == shdr->Command)) {  			kref_get(&mid->refcount); +			if (dequeue) { +				list_del_init(&mid->qhead); +				mid->mid_flags |= MID_DELETED; +			}  			spin_unlock(&GlobalMid_Lock);  			return mid;  		} @@ -289,6 +293,18 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)  	return NULL;  } +static struct mid_q_entry * +smb2_find_mid(struct TCP_Server_Info *server, char *buf) +{ +	return __smb2_find_mid(server, buf, false); +} + +static struct mid_q_entry * +smb2_find_dequeue_mid(struct TCP_Server_Info *server, char *buf) +{ +	return __smb2_find_mid(server, buf, true); +} +  static void  
smb2_dump_detail(void *buf, struct TCP_Server_Info *server)  { @@ -4356,7 +4372,8 @@ init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size,  static int  handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,  		 char *buf, unsigned int buf_len, struct page **pages, -		 unsigned int npages, unsigned int page_data_size) +		 unsigned int npages, unsigned int page_data_size, +		 bool is_offloaded)  {  	unsigned int data_offset;  	unsigned int data_len; @@ -4378,7 +4395,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,  	if (server->ops->is_session_expired &&  	    server->ops->is_session_expired(buf)) { -		cifs_reconnect(server); +		if (!is_offloaded) +			cifs_reconnect(server);  		return -1;  	} @@ -4402,7 +4420,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,  		cifs_dbg(FYI, "%s: server returned error %d\n",  			 __func__, rdata->result);  		/* normal error on read response */ -		dequeue_mid(mid, false); +		if (is_offloaded) +			mid->mid_state = MID_RESPONSE_RECEIVED; +		else +			dequeue_mid(mid, false);  		return 0;  	} @@ -4426,7 +4447,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,  		cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",  			 __func__, data_offset);  		rdata->result = -EIO; -		dequeue_mid(mid, rdata->result); +		if (is_offloaded) +			mid->mid_state = MID_RESPONSE_MALFORMED; +		else +			dequeue_mid(mid, rdata->result);  		return 0;  	} @@ -4442,21 +4466,30 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,  			cifs_dbg(FYI, "%s: data offset (%u) beyond 1st page of response\n",  				 __func__, data_offset);  			rdata->result = -EIO; -			dequeue_mid(mid, rdata->result); +			if (is_offloaded) +				mid->mid_state = MID_RESPONSE_MALFORMED; +			else +				dequeue_mid(mid, rdata->result);  			return 0;  		}  		if (data_len > page_data_size - pad_len) {  			/* data_len is corrupt -- discard frame */  			rdata->result = -EIO; -			dequeue_mid(mid, rdata->result); +			if (is_offloaded) +				mid->mid_state = MID_RESPONSE_MALFORMED; +			else +				dequeue_mid(mid, rdata->result);  			return 0;  		}  		rdata->result = init_read_bvec(pages, npages, page_data_size,  					       cur_off, &bvec);  		if (rdata->result != 0) { -			dequeue_mid(mid, rdata->result); +			if (is_offloaded) +				mid->mid_state = MID_RESPONSE_MALFORMED; +			else +				dequeue_mid(mid, rdata->result);  			return 0;  		} @@ -4471,7 +4504,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,  		/* read response payload cannot be in both buf and pages */  		WARN_ONCE(1, "buf can not contain only a part of read data");  		rdata->result = -EIO; -		dequeue_mid(mid, rdata->result); +		if (is_offloaded) +			mid->mid_state = MID_RESPONSE_MALFORMED; +		else +			dequeue_mid(mid, rdata->result);  		return 0;  	} @@ -4482,7 +4518,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,  	if (length < 0)  		return length; -	dequeue_mid(mid, false); +	if (is_offloaded) +		mid->mid_state = MID_RESPONSE_RECEIVED; +	else +		dequeue_mid(mid, false);  	return length;  } @@ -4511,15 +4550,34 @@ static void smb2_decrypt_offload(struct work_struct *work)  	}  	dw->server->lstrp = jiffies; -	mid = smb2_find_mid(dw->server, dw->buf); +	mid = smb2_find_dequeue_mid(dw->server, dw->buf);  	if (mid == NULL)  		cifs_dbg(FYI, "mid not found\n");  	else {  		mid->decrypted = true;  		rc = handle_read_data(dw->server, mid, dw->buf,  		
		      dw->server->vals->read_rsp_size, -				      dw->ppages, dw->npages, dw->len); -		mid->callback(mid); +				      dw->ppages, dw->npages, dw->len, +				      true); +		if (rc >= 0) { +#ifdef CONFIG_CIFS_STATS2 +			mid->when_received = jiffies; +#endif +			mid->callback(mid); +		} else { +			spin_lock(&GlobalMid_Lock); +			if (dw->server->tcpStatus == CifsNeedReconnect) { +				mid->mid_state = MID_RETRY_NEEDED; +				spin_unlock(&GlobalMid_Lock); +				mid->callback(mid); +			} else { +				mid->mid_state = MID_REQUEST_SUBMITTED; +				mid->mid_flags &= ~(MID_DELETED); +				list_add_tail(&mid->qhead, +					&dw->server->pending_mid_q); +				spin_unlock(&GlobalMid_Lock); +			} +		}  		cifs_mid_q_entry_release(mid);  	} @@ -4622,7 +4680,7 @@ non_offloaded_decrypt:  		(*mid)->decrypted = true;  		rc = handle_read_data(server, *mid, buf,  				      server->vals->read_rsp_size, -				      pages, npages, len); +				      pages, npages, len, false);  	}  free_pages: @@ -4765,7 +4823,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)  	char *buf = server->large_buf ? server->bigbuf : server->smallbuf;  	return handle_read_data(server, mid, buf, server->pdu_size, -				NULL, 0, 0); +				NULL, 0, 0, false);  }  static int diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 89bffa82ed74..c57bebfa48fe 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -74,7 +74,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)  	int i;  	/* The file must need contents encryption, not filenames encryption */ -	if (!fscrypt_needs_contents_encryption(inode)) +	if (!S_ISREG(inode->i_mode))  		return 0;  	/* The crypto mode must have a blk-crypto counterpart */ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index d3c3e5d9b41f..d595abb8ef90 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -269,9 +269,7 @@ unlock:  	 * New inodes may not have an inode number assigned yet.  	 * Hashing their inode number is delayed until later.  	 */ -	if (ci->ci_inode->i_ino == 0) -		WARN_ON(!(ci->ci_inode->i_state & I_CREATING)); -	else +	if (ci->ci_inode->i_ino)  		fscrypt_hash_inode_number(ci, mk);  	return 0;  } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index a768a09430c3..686e0ad28788 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1127,24 +1127,23 @@ static const struct file_operations debugfs_devm_entry_ops = {   *	file will be created in the root of the debugfs filesystem.   * @read_fn: function pointer called to print the seq_file content.   
*/ -struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, -					   struct dentry *parent, -					   int (*read_fn)(struct seq_file *s, -							  void *data)) +void debugfs_create_devm_seqfile(struct device *dev, const char *name, +				 struct dentry *parent, +				 int (*read_fn)(struct seq_file *s, void *data))  {  	struct debugfs_devm_entry *entry;  	if (IS_ERR(parent)) -		return ERR_PTR(-ENOENT); +		return;  	entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL);  	if (!entry) -		return ERR_PTR(-ENOMEM); +		return;  	entry->read = read_fn;  	entry->dev = dev; -	return debugfs_create_file(name, S_IRUGO, parent, entry, -				   &debugfs_devm_entry_ops); +	debugfs_create_file(name, S_IRUGO, parent, entry, +			    &debugfs_devm_entry_ops);  }  EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 96c0c86f3fff..0297ad95eb5c 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -7,6 +7,7 @@  #include <linux/efi.h>  #include <linux/fs.h>  #include <linux/ctype.h> +#include <linux/kmemleak.h>  #include <linux/slab.h>  #include <linux/uuid.h> @@ -103,6 +104,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,  	var->var.VariableName[i] = '\0';  	inode->i_private = var; +	kmemleak_ignore(var);  	err = efivar_entry_add(var, &efivarfs_list);  	if (err) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 139d0bed42f8..3e21c0e8adae 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -107,11 +107,9 @@ static struct page *erofs_read_inode(struct inode *inode,  		i_gid_write(inode, le32_to_cpu(die->i_gid));  		set_nlink(inode, le32_to_cpu(die->i_nlink)); -		/* ns timestamp */ -		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = -			le64_to_cpu(die->i_ctime); -		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = -			le32_to_cpu(die->i_ctime_nsec); +		/* extended inode has its own timestamp */ +		inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime); +		inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec);  		inode->i_size = le64_to_cpu(die->i_size); @@ -149,11 +147,9 @@ static struct page *erofs_read_inode(struct inode *inode,  		i_gid_write(inode, le16_to_cpu(dic->i_gid));  		set_nlink(inode, le16_to_cpu(dic->i_nlink)); -		/* use build time to derive all file time */ -		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = -			sbi->build_time; -		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = -			sbi->build_time_nsec; +		/* use build time for compact inodes */ +		inode->i_ctime.tv_sec = sbi->build_time; +		inode->i_ctime.tv_nsec = sbi->build_time_nsec;  		inode->i_size = le32_to_cpu(dic->i_size);  		if (erofs_inode_is_data_compressed(vi->datalayout)) @@ -167,6 +163,11 @@ static struct page *erofs_read_inode(struct inode *inode,  		goto err_out;  	} +	inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; +	inode->i_atime.tv_sec = inode->i_ctime.tv_sec; +	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; +	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; +  	if (!nblks)  		/* measure inode.i_blocks as generic filesystems */  		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 50912a5420b4..86fd3bf62af6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1078,8 +1078,11 @@ out_allocpage:  		cond_resched();  		goto repeat;  	} -	set_page_private(page, (unsigned long)pcl); -	SetPagePrivate(page); + +	if (tocache) { +		set_page_private(page, (unsigned long)pcl); +		SetPagePrivate(page); +	}  out:	/* the only exit (for tracing and debugging) 
*/  	return page;  } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5b81f3b080ee..ca50c90adc4c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -669,68 +669,8 @@ const struct file_operations ext4_dir_operations = {  };  #ifdef CONFIG_UNICODE -static int ext4_d_compare(const struct dentry *dentry, unsigned int len, -			  const char *str, const struct qstr *name) -{ -	struct qstr qstr = {.name = str, .len = len }; -	const struct dentry *parent = READ_ONCE(dentry->d_parent); -	const struct inode *inode = d_inode_rcu(parent); -	char strbuf[DNAME_INLINE_LEN]; - -	if (!inode || !IS_CASEFOLDED(inode) || -	    !EXT4_SB(inode->i_sb)->s_encoding) { -		if (len != name->len) -			return -1; -		return memcmp(str, name->name, len); -	} - -	/* -	 * If the dentry name is stored in-line, then it may be concurrently -	 * modified by a rename.  If this happens, the VFS will eventually retry -	 * the lookup, so it doesn't matter what ->d_compare() returns. -	 * However, it's unsafe to call utf8_strncasecmp() with an unstable -	 * string.  Therefore, we have to copy the name into a temporary buffer. -	 */ -	if (len <= DNAME_INLINE_LEN - 1) { -		memcpy(strbuf, str, len); -		strbuf[len] = 0; -		qstr.name = strbuf; -		/* prevent compiler from optimizing out the temporary buffer */ -		barrier(); -	} - -	return ext4_ci_compare(inode, name, &qstr, false); -} - -static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) -{ -	const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); -	const struct unicode_map *um = sbi->s_encoding; -	const struct inode *inode = d_inode_rcu(dentry); -	unsigned char *norm; -	int len, ret = 0; - -	if (!inode || !IS_CASEFOLDED(inode) || !um) -		return 0; - -	norm = kmalloc(PATH_MAX, GFP_ATOMIC); -	if (!norm) -		return -ENOMEM; - -	len = utf8_casefold(um, str, norm, PATH_MAX); -	if (len < 0) { -		if (ext4_has_strict_mode(sbi)) -			ret = -EINVAL; -		goto out; -	} -	str->hash = full_name_hash(dentry, norm, len); -out: -	kfree(norm); -	return ret; -} -  const struct dentry_operations ext4_dentry_ops = { -	.d_hash = ext4_d_hash, -	.d_compare = ext4_d_compare, +	.d_hash = generic_ci_d_hash, +	.d_compare = generic_ci_d_compare,  };  #endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 254d1c26bea8..65ecaf96d0a4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1028,9 +1028,6 @@ struct ext4_inode_info {  					 * protected by sbi->s_fc_lock.  					 */ -	/* Fast commit subtid when this inode was committed */ -	unsigned int i_fc_committed_subtid; -  	/* Start of lblk range that needs to be committed in this fast commit */  	ext4_lblk_t i_fc_lblk_start; @@ -1166,10 +1163,6 @@ struct ext4_inode_info {  #define	EXT4_VALID_FS			0x0001	/* Unmounted cleanly */  #define	EXT4_ERROR_FS			0x0002	/* Errors detected */  #define	EXT4_ORPHAN_FS			0x0004	/* Orphans being recovered */ -#define EXT4_FC_INELIGIBLE		0x0008	/* Fast commit ineligible */ -#define EXT4_FC_COMMITTING		0x0010	/* File system underoing a fast -						 * commit. 
-						 */  #define EXT4_FC_REPLAY			0x0020	/* Fast commit replay ongoing */  /* @@ -1238,13 +1231,13 @@ struct ext4_inode_info {  						      blocks */  #define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated  						      file systems */ -#define EXT4_MOUNT2_DAX_NEVER		0x00000008 /* Do not allow Direct Access */ -#define EXT4_MOUNT2_DAX_INODE		0x00000010 /* For printing options only */ -  #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly  						specified journal checksum */  #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT	0x00000010 /* Journal fast commit */ +#define EXT4_MOUNT2_DAX_NEVER		0x00000020 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE		0x00000040 /* For printing options only */ +  #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \  						~EXT4_MOUNT_##opt @@ -1426,12 +1419,6 @@ struct ext4_super_block {  #ifdef __KERNEL__ -/* - * run-time mount flags - */ -#define EXT4_MF_MNTDIR_SAMPLED		0x0001 -#define EXT4_MF_FS_ABORTED		0x0002	/* Fatal error detected */ -  #ifdef CONFIG_FS_ENCRYPTION  #define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)  #else @@ -1444,14 +1431,6 @@ struct ext4_super_block {  #define EXT4_ENC_UTF8_12_1	1  /* - * Flags for ext4_sb_info.s_encoding_flags. - */ -#define EXT4_ENC_STRICT_MODE_FL	(1 << 0) - -#define ext4_has_strict_mode(sbi) \ -	(sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) - -/*   * fourth extended-fs super-block data in memory   */  struct ext4_sb_info { @@ -1474,7 +1453,7 @@ struct ext4_sb_info {  	struct buffer_head * __rcu *s_group_desc;  	unsigned int s_mount_opt;  	unsigned int s_mount_opt2; -	unsigned int s_mount_flags; +	unsigned long s_mount_flags;  	unsigned int s_def_mount_opt;  	ext4_fsblk_t s_sb_block;  	atomic64_t s_resv_clusters; @@ -1500,10 +1479,6 @@ struct ext4_sb_info {  	struct kobject s_kobj;  	struct completion s_kobj_unregister;  	struct super_block *s_sb; -#ifdef CONFIG_UNICODE -	struct unicode_map *s_encoding; -	__u16 s_encoding_flags; -#endif  	/* Journaling */  	struct journal_s *s_journal; @@ -1707,6 +1682,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)  })  /* + * run-time mount flags + */ +enum { +	EXT4_MF_MNTDIR_SAMPLED, +	EXT4_MF_FS_ABORTED,	/* Fatal error detected */ +	EXT4_MF_FC_INELIGIBLE,	/* Fast commit ineligible */ +	EXT4_MF_FC_COMMITTING	/* File system undergoing a fast +				 * commit. +				 */ +}; + +static inline void ext4_set_mount_flag(struct super_block *sb, int bit) +{ +	set_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) +{ +	clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline int ext4_test_mount_flag(struct super_block *sb, int bit) +{ +	return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + + +/*   * Simulate_fail codes   */  #define EXT4_SIM_BBITMAP_EIO	1 @@ -1875,6 +1878,13 @@ static inline bool ext4_verity_in_progress(struct inode *inode)  #define EXT4_FEATURE_COMPAT_RESIZE_INODE	0x0010  #define EXT4_FEATURE_COMPAT_DIR_INDEX		0x0020  #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2	0x0200 +/* + * The reason why "FAST_COMMIT" is a compat feature is that the FS becomes + * incompatible only if fast commit blocks are present in the FS. Since we + * clear the journal (and thus the fast commit blocks), we don't mark FS as + * incompatible. We also have a JBD2 incompat feature, which gets set when + * there are fast commit blocks present in the journal. 
+ */  #define EXT4_FEATURE_COMPAT_FAST_COMMIT		0x0400  #define EXT4_FEATURE_COMPAT_STABLE_INODES	0x0800 @@ -2685,7 +2695,8 @@ void ext4_insert_dentry(struct inode *inode,  			struct ext4_filename *fname);  static inline void ext4_update_dx_flag(struct inode *inode)  { -	if (!ext4_has_feature_dir_index(inode->i_sb)) { +	if (!ext4_has_feature_dir_index(inode->i_sb) && +	    ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {  		/* ext4_iget() should have caught this... */  		WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));  		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); @@ -2743,12 +2754,16 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);  int ext4_fc_info_show(struct seq_file *seq, void *v);  void ext4_fc_init(struct super_block *sb, journal_t *journal);  void ext4_fc_init_inode(struct inode *inode); -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,  			 ext4_lblk_t end); -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_inode(struct inode *inode); +void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, +	struct dentry *dentry); +void __ext4_fc_track_link(handle_t *handle, struct inode *inode, +	struct dentry *dentry); +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_inode(handle_t *handle, struct inode *inode);  void ext4_fc_mark_ineligible(struct super_block *sb, int reason);  void ext4_fc_start_ineligible(struct super_block *sb, int reason);  void ext4_fc_stop_ineligible(struct super_block *sb); @@ -3464,7 +3479,7 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,  extern int ext4_ci_compare(const struct inode *parent,  			   const struct qstr *fname,  			   const struct qstr *entry, bool quick); -extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,  			 struct inode *inode);  extern int __ext4_link(struct inode *dir, struct inode *inode,  		       struct dentry *dentry); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 559100f3e23c..17d7096b3212 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1471,16 +1471,16 @@ static int ext4_ext_search_left(struct inode *inode,  }  /* - * search the closest allocated block to the right for *logical - * and returns it at @logical + it's physical address at @phys - * if *logical is the largest allocated block, the function - * returns 0 at @phys - * return value contains 0 (success) or error code + * Search for the closest allocated block to the right of *logical + * and return it at @logical, with its physical address at @phys. + * If no such block exists, return 0 and set @phys to 0. Return 1 + * when an allocated block is found (ret_ex is then valid), or a + * negative error code on failure.   
*/  static int ext4_ext_search_right(struct inode *inode,  				 struct ext4_ext_path *path,  				 ext4_lblk_t *logical, ext4_fsblk_t *phys, -				 struct ext4_extent **ret_ex) +				 struct ext4_extent *ret_ex)  {  	struct buffer_head *bh = NULL;  	struct ext4_extent_header *eh; @@ -1574,10 +1574,11 @@ got_index:  found_extent:  	*logical = le32_to_cpu(ex->ee_block);  	*phys = ext4_ext_pblock(ex); -	*ret_ex = ex; +	if (ret_ex) +		*ret_ex = *ex;  	if (bh)  		put_bh(bh); -	return 0; +	return 1;  }  /* @@ -2868,8 +2869,8 @@ again:  			 */  			lblk = ex_end + 1;  			err = ext4_ext_search_right(inode, path, &lblk, &pblk, -						    &ex); -			if (err) +						    NULL); +			if (err < 0)  				goto out;  			if (pblk) {  				partial.pclu = EXT4_B2C(sbi, pblk); @@ -3723,7 +3724,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,  	err = ext4_ext_dirty(handle, inode, path + path->p_depth);  out:  	ext4_ext_show_leaf(inode, path); -	ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);  	return err;  } @@ -3795,7 +3795,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,  	if (*allocated > map->m_len)  		*allocated = map->m_len;  	map->m_len = *allocated; -	ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);  	return 0;  } @@ -4039,7 +4038,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  			struct ext4_map_blocks *map, int flags)  {  	struct ext4_ext_path *path = NULL; -	struct ext4_extent newex, *ex, *ex2; +	struct ext4_extent newex, *ex, ex2;  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	ext4_fsblk_t newblock = 0, pblk;  	int err = 0, depth, ret; @@ -4175,15 +4174,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  	if (err)  		goto out;  	ar.lright = map->m_lblk; -	ex2 = NULL;  	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); -	if (err) +	if (err < 0)  		goto out;  	/* Check if the extent after searching to the right implies a  	 * cluster we can use. 
*/ -	if ((sbi->s_cluster_ratio > 1) && ex2 && -	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { +	if ((sbi->s_cluster_ratio > 1) && err && +	    get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {  		ar.len = allocated = map->m_len;  		newblock = map->m_pblk;  		goto got_allocated_blocks; @@ -4329,7 +4327,6 @@ got_allocated_blocks:  	map->m_len = ar.len;  	allocated = map->m_len;  	ext4_ext_show_leaf(inode, path); -	ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1);  out:  	ext4_ext_drop_refs(path);  	kfree(path); @@ -4602,7 +4599,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,  	ret = ext4_mark_inode_dirty(handle, inode);  	if (unlikely(ret))  		goto out_handle; -	ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits, +	ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,  			(offset + len - 1) >> inode->i_sb->s_blocksize_bits);  	/* Zero out partial block at the edges of the range */  	ret = ext4_zero_partial_blocks(handle, inode, offset, len); @@ -4651,8 +4648,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)  		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |  		     FALLOC_FL_INSERT_RANGE))  		return -EOPNOTSUPP; -	ext4_fc_track_range(inode, offset >> blkbits, -			(offset + len - 1) >> blkbits);  	ext4_fc_start_update(inode); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 447c8d93f480..f2033e13a273 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -83,7 +83,7 @@   *   * Atomicity of commits   * -------------------- - * In order to gaurantee atomicity during the commit operation, fast commit + * In order to guarantee atomicity during the commit operation, fast commit   * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail   * tag contains CRC of the contents and TID of the transaction after which   * this fast commit should be applied. Recovery code replays fast commit @@ -152,7 +152,31 @@ void ext4_fc_init_inode(struct inode *inode)  	INIT_LIST_HEAD(&ei->i_fc_list);  	init_waitqueue_head(&ei->i_fc_wait);  	atomic_set(&ei->i_fc_updates, 0); -	ei->i_fc_committed_subtid = 0; +} + +/* This function must be called with sbi->s_fc_lock held. 
*/ +static void ext4_fc_wait_committing_inode(struct inode *inode) +__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) +{ +	wait_queue_head_t *wq; +	struct ext4_inode_info *ei = EXT4_I(inode); + +#if (BITS_PER_LONG < 64) +	DEFINE_WAIT_BIT(wait, &ei->i_state_flags, +			EXT4_STATE_FC_COMMITTING); +	wq = bit_waitqueue(&ei->i_state_flags, +				EXT4_STATE_FC_COMMITTING); +#else +	DEFINE_WAIT_BIT(wait, &ei->i_flags, +			EXT4_STATE_FC_COMMITTING); +	wq = bit_waitqueue(&ei->i_flags, +				EXT4_STATE_FC_COMMITTING); +#endif +	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); +	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); +	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); +	schedule(); +	finish_wait(wq, &wait.wq_entry);  }  /* @@ -176,22 +200,7 @@ restart:  		goto out;  	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { -		wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) -		DEFINE_WAIT_BIT(wait, &ei->i_state_flags, -				EXT4_STATE_FC_COMMITTING); -		wq = bit_waitqueue(&ei->i_state_flags, -				   EXT4_STATE_FC_COMMITTING); -#else -		DEFINE_WAIT_BIT(wait, &ei->i_flags, -				EXT4_STATE_FC_COMMITTING); -		wq = bit_waitqueue(&ei->i_flags, -				   EXT4_STATE_FC_COMMITTING); -#endif -		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); -		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -		schedule(); -		finish_wait(wq, &wait.wq_entry); +		ext4_fc_wait_committing_inode(inode);  		goto restart;  	}  out: @@ -234,26 +243,10 @@ restart:  	}  	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { -		wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) -		DEFINE_WAIT_BIT(wait, &ei->i_state_flags, -				EXT4_STATE_FC_COMMITTING); -		wq = bit_waitqueue(&ei->i_state_flags, -				   EXT4_STATE_FC_COMMITTING); -#else -		DEFINE_WAIT_BIT(wait, &ei->i_flags, -				EXT4_STATE_FC_COMMITTING); -		wq = bit_waitqueue(&ei->i_flags, -				   EXT4_STATE_FC_COMMITTING); -#endif -		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); -		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -		schedule(); -		finish_wait(wq, &wait.wq_entry); +		ext4_fc_wait_committing_inode(inode);  		goto restart;  	} -	if (!list_empty(&ei->i_fc_list)) -		list_del_init(&ei->i_fc_list); +	list_del_init(&ei->i_fc_list);  	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);  } @@ -269,7 +262,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)  	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))  		return; -	sbi->s_mount_state |= EXT4_FC_INELIGIBLE; +	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);  	WARN_ON(reason >= EXT4_FC_REASON_MAX);  	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;  } @@ -292,7 +285,7 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)  }  /* - * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here + * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here   * to ensure that after stopping the ineligible update, at least one full   * commit takes place.   
*/ @@ -302,14 +295,14 @@ void ext4_fc_stop_ineligible(struct super_block *sb)  	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))  		return; -	EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE; +	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);  	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);  }  static inline int ext4_fc_is_ineligible(struct super_block *sb)  { -	return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) || -		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); +	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || +		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));  }  /* @@ -323,13 +316,14 @@ static inline int ext4_fc_is_ineligible(struct super_block *sb)   * If enqueue is set, this function enqueues the inode in fast commit list.   */  static int ext4_fc_track_template( -	struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), +	handle_t *handle, struct inode *inode, +	int (*__fc_track_fn)(struct inode *, void *, bool),  	void *args, int enqueue)  { -	tid_t running_txn_tid;  	bool update = false;  	struct ext4_inode_info *ei = EXT4_I(inode);  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	tid_t tid = 0;  	int ret;  	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || @@ -339,15 +333,13 @@ static int ext4_fc_track_template(  	if (ext4_fc_is_ineligible(inode->i_sb))  		return -EINVAL; -	running_txn_tid = sbi->s_journal ? -		sbi->s_journal->j_commit_sequence + 1 : 0; - +	tid = handle->h_transaction->t_tid;  	mutex_lock(&ei->i_fc_lock); -	if (running_txn_tid == ei->i_sync_tid) { +	if (tid == ei->i_sync_tid) {  		update = true;  	} else {  		ext4_fc_reset_inode(inode); -		ei->i_sync_tid = running_txn_tid; +		ei->i_sync_tid = tid;  	}  	ret = __fc_track_fn(inode, args, update);  	mutex_unlock(&ei->i_fc_lock); @@ -358,7 +350,7 @@ static int ext4_fc_track_template(  	spin_lock(&sbi->s_fc_lock);  	if (list_empty(&EXT4_I(inode)->i_fc_list))  		list_add_tail(&EXT4_I(inode)->i_fc_list, -				(sbi->s_mount_state & EXT4_FC_COMMITTING) ? +				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?  				
&sbi->s_fc_q[FC_Q_STAGING] :  				&sbi->s_fc_q[FC_Q_MAIN]);  	spin_unlock(&sbi->s_fc_lock); @@ -384,7 +376,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)  	mutex_unlock(&ei->i_fc_lock);  	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);  	if (!node) { -		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM); +		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);  		mutex_lock(&ei->i_fc_lock);  		return -ENOMEM;  	} @@ -397,7 +389,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)  		if (!node->fcd_name.name) {  			kmem_cache_free(ext4_fc_dentry_cachep, node);  			ext4_fc_mark_ineligible(inode->i_sb, -				EXT4_FC_REASON_MEM); +				EXT4_FC_REASON_NOMEM);  			mutex_lock(&ei->i_fc_lock);  			return -ENOMEM;  		} @@ -411,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)  	node->fcd_name.len = dentry->d_name.len;  	spin_lock(&sbi->s_fc_lock); -	if (sbi->s_mount_state & EXT4_FC_COMMITTING) +	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))  		list_add_tail(&node->fcd_list,  				&sbi->s_fc_dentry_q[FC_Q_STAGING]);  	else @@ -422,7 +414,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)  	return 0;  } -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) +void __ext4_fc_track_unlink(handle_t *handle, +		struct inode *inode, struct dentry *dentry)  {  	struct __track_dentry_update_args args;  	int ret; @@ -430,12 +423,18 @@ void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)  	args.dentry = dentry;  	args.op = EXT4_FC_TAG_UNLINK; -	ret = ext4_fc_track_template(inode, __track_dentry_update, +	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,  					(void *)&args, 0);  	trace_ext4_fc_track_unlink(inode, dentry, ret);  } -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) +{ +	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry); +} + +void __ext4_fc_track_link(handle_t *handle, +	struct inode *inode, struct dentry *dentry)  {  	struct __track_dentry_update_args args;  	int ret; @@ -443,20 +442,26 @@ void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)  	args.dentry = dentry;  	args.op = EXT4_FC_TAG_LINK; -	ret = ext4_fc_track_template(inode, __track_dentry_update, +	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,  					(void *)&args, 0);  	trace_ext4_fc_track_link(inode, dentry, ret);  } -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) +{ +	__ext4_fc_track_link(handle, d_inode(dentry), dentry); +} + +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)  {  	struct __track_dentry_update_args args; +	struct inode *inode = d_inode(dentry);  	int ret;  	args.dentry = dentry;  	args.op = EXT4_FC_TAG_CREAT; -	ret = ext4_fc_track_template(inode, __track_dentry_update, +	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,  					(void *)&args, 0);  	trace_ext4_fc_track_create(inode, dentry, ret);  } @@ -472,14 +477,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update)  	return 0;  } -void ext4_fc_track_inode(struct inode *inode) +void ext4_fc_track_inode(handle_t *handle, struct inode *inode)  {  	int ret;  	if (S_ISDIR(inode->i_mode))  		return; -	ret = ext4_fc_track_template(inode, __track_inode, NULL, 1); +	if 
(ext4_should_journal_data(inode)) { +		ext4_fc_mark_ineligible(inode->i_sb, +					EXT4_FC_REASON_INODE_JOURNAL_DATA); +		return; +	} + +	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);  	trace_ext4_fc_track_inode(inode, ret);  } @@ -515,7 +526,7 @@ static int __track_range(struct inode *inode, void *arg, bool update)  	return 0;  } -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,  			 ext4_lblk_t end)  {  	struct __track_range_args args; @@ -527,7 +538,7 @@ void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,  	args.start = start;  	args.end = end; -	ret = ext4_fc_track_template(inode,  __track_range, &args, 1); +	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);  	trace_ext4_fc_track_range(inode, start, end, ret);  } @@ -537,10 +548,11 @@ static void ext4_fc_submit_bh(struct super_block *sb)  	int write_flags = REQ_SYNC;  	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; +	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */  	if (test_opt(sb, BARRIER))  		write_flags |= REQ_FUA | REQ_PREFLUSH;  	lock_buffer(bh); -	clear_buffer_dirty(bh); +	set_buffer_dirty(bh);  	set_buffer_uptodate(bh);  	bh->b_end_io = ext4_end_buffer_io_sync;  	submit_bh(REQ_OP_WRITE, write_flags, bh); @@ -846,7 +858,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)  	int ret = 0;  	spin_lock(&sbi->s_fc_lock); -	sbi->s_mount_state |= EXT4_FC_COMMITTING; +	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);  	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {  		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);  		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); @@ -900,6 +912,8 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal)  /* Commit all the directory entry updates */  static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) +__acquires(&sbi->s_fc_lock) +__releases(&sbi->s_fc_lock)  {  	struct super_block *sb = (struct super_block *)(journal->j_private);  	struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -964,7 +978,6 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)  			fc_dentry->fcd_parent, fc_dentry->fcd_ino,  			fc_dentry->fcd_name.len,  			fc_dentry->fcd_name.name, crc)) { -			spin_lock(&sbi->s_fc_lock);  			ret = -ENOSPC;  			goto lock_and_exit;  		} @@ -997,6 +1010,13 @@ static int ext4_fc_perform_commit(journal_t *journal)  	if (ret)  		return ret; +	/* +	 * If file system device is different from journal device, issue a cache +	 * flush before we start writing fast commit blocks. 
+	 */ +	if (journal->j_fs_dev != journal->j_dev) +		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); +  	blk_start_plug(&plug);  	if (sbi->s_fc_bytes == 0) {  		/* @@ -1032,8 +1052,6 @@ static int ext4_fc_perform_commit(journal_t *journal)  		if (ret)  			goto out;  		spin_lock(&sbi->s_fc_lock); -		EXT4_I(inode)->i_fc_committed_subtid = -			atomic_read(&sbi->s_fc_subtid);  	}  	spin_unlock(&sbi->s_fc_lock); @@ -1132,7 +1150,7 @@ out:  		"Fast commit ended with blks = %d, reason = %d, subtid - %d",  		nblks, reason, subtid);  	if (reason == EXT4_FC_REASON_FC_FAILED) -		return jbd2_fc_end_commit_fallback(journal, commit_tid); +		return jbd2_fc_end_commit_fallback(journal);  	if (reason == EXT4_FC_REASON_FC_START_FAILED ||  		reason == EXT4_FC_REASON_INELIGIBLE)  		return jbd2_complete_transaction(journal, commit_tid); @@ -1191,8 +1209,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)  	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],  				&sbi->s_fc_q[FC_Q_STAGING]); -	sbi->s_mount_state &= ~EXT4_FC_COMMITTING; -	sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; +	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); +	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);  	if (full)  		sbi->s_fc_bytes = 0; @@ -1264,7 +1282,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)  		return 0;  	} -	ret = __ext4_unlink(old_parent, &entry, inode); +	ret = __ext4_unlink(NULL, old_parent, &entry, inode);  	/* -ENOENT ok coz it might not exist anymore. */  	if (ret == -ENOENT)  		ret = 0; @@ -1617,8 +1635,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb,  		if (ret == 0) {  			/* Range is not mapped */  			path = ext4_find_extent(inode, cur, NULL, 0); -			if (!path) -				continue; +			if (IS_ERR(path)) { +				iput(inode); +				return 0; +			}  			memset(&newex, 0, sizeof(newex));  			newex.ee_block = cpu_to_le32(cur);  			ext4_ext_store_pblock( @@ -2087,13 +2107,9 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)  	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))  		return;  	journal->j_fc_cleanup_callback = ext4_fc_cleanup; -	if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) { -		pr_warn("Error while enabling fast commits, turning off."); -		ext4_clear_feature_fast_commit(sb); -	}  } -const char *fc_ineligible_reasons[] = { +static const char *fc_ineligible_reasons[] = {  	"Extended attributes changed",  	"Cross rename",  	"Journal flag changed", @@ -2102,6 +2118,7 @@ const char *fc_ineligible_reasons[] = {  	"Resize",  	"Dir renamed",  	"Falloc range op", +	"Data journalling",  	"FC Commit Failed"  }; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 06907d485989..3a6e5a1fa1b8 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -3,9 +3,6 @@  #ifndef __FAST_COMMIT_H__  #define __FAST_COMMIT_H__ -/* Number of blocks in journal area to allocate for fast commits */ -#define EXT4_NUM_FC_BLKS		256 -  /* Fast commit tags */  #define EXT4_FC_TAG_ADD_RANGE		0x0001  #define EXT4_FC_TAG_DEL_RANGE		0x0002 @@ -100,11 +97,12 @@ enum {  	EXT4_FC_REASON_XATTR = 0,  	EXT4_FC_REASON_CROSS_RENAME,  	EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, -	EXT4_FC_REASON_MEM, +	EXT4_FC_REASON_NOMEM,  	EXT4_FC_REASON_SWAP_BOOT,  	EXT4_FC_REASON_RESIZE,  	EXT4_FC_REASON_RENAME_DIR,  	EXT4_FC_REASON_FALLOC_RANGE, +	EXT4_FC_REASON_INODE_JOURNAL_DATA,  	EXT4_FC_COMMIT_FAILED,  	EXT4_FC_REASON_MAX  }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d85412d12e3a..3ed8c048fb12 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -761,7 +761,6 @@ static int 
ext4_file_mmap(struct file *file, struct vm_area_struct *vma)  	if (!daxdev_mapping_supported(vma, dax_dev))  		return -EOPNOTSUPP; -	ext4_fc_start_update(inode);  	file_accessed(file);  	if (IS_DAX(file_inode(file))) {  		vma->vm_ops = &ext4_dax_vm_ops; @@ -769,7 +768,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)  	} else {  		vma->vm_ops = &ext4_file_vm_ops;  	} -	ext4_fc_stop_update(inode);  	return 0;  } @@ -782,13 +780,13 @@ static int ext4_sample_last_mounted(struct super_block *sb,  	handle_t *handle;  	int err; -	if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) +	if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))  		return 0;  	if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))  		return 0; -	sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; +	ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);  	/*  	 * Sample where the filesystem has been mounted and  	 * store it in the superblock for sysadmin convenience diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b232c2767534..4c2a9fe30067 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,  	/* Fabricate an rmap entry for the external log device. */  	irec.fmr_physical = journal->j_blk_offset; -	irec.fmr_length = journal->j_maxlen; +	irec.fmr_length = journal->j_total_len;  	irec.fmr_owner = EXT4_FMR_OWN_LOG;  	irec.fmr_flags = 0; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 81a545fd14a3..a42ca95840f2 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -143,7 +143,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	if (sb_rdonly(inode->i_sb)) {  		/* Make sure that we read updated s_mount_flags value */  		smp_rmb(); -		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) +		if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))  			ret = -EROFS;  		goto out;  	} diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 2924261226e0..a92eb79de0cc 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -275,7 +275,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,  		   struct dx_hash_info *hinfo)  {  #ifdef CONFIG_UNICODE -	const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding; +	const struct unicode_map *um = dir->i_sb->s_encoding;  	int r, dlen;  	unsigned char *buff;  	struct qstr qstr = {.name = name, .len = len }; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index caa51473207d..b41512d1badc 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1880,6 +1880,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)  	ext4_write_lock_xattr(inode, &no_expand);  	if (!ext4_has_inline_data(inode)) { +		ext4_write_unlock_xattr(inode, &no_expand);  		*has_inline = 0;  		ext4_journal_stop(handle);  		return 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03c2253005f0..0d8385aea898 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -327,6 +327,8 @@ stop_handle:  	ext4_xattr_inode_array_free(ea_inode_array);  	return;  no_delete: +	if (!list_empty(&EXT4_I(inode)->i_fc_list)) +		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);  	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... 
*/  } @@ -730,7 +732,7 @@ out_sem:  			if (ret)  				return ret;  		} -		ext4_fc_track_range(inode, map->m_lblk, +		ext4_fc_track_range(handle, inode, map->m_lblk,  			    map->m_lblk + map->m_len - 1);  	} @@ -1918,7 +1920,7 @@ static int __ext4_journalled_writepage(struct page *page,  	}  	if (ret == 0)  		ret = err; -	err = ext4_jbd2_inode_add_write(handle, inode, 0, len); +	err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);  	if (ret == 0)  		ret = err;  	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -2440,7 +2442,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,  			struct super_block *sb = inode->i_sb;  			if (ext4_forced_shutdown(EXT4_SB(sb)) || -			    EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) +			    ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))  				goto invalidate_dirty_pages;  			/*  			 * Let the upper layers retry transient errors. @@ -2674,7 +2676,7 @@ static int ext4_writepages(struct address_space *mapping,  	 * the stack trace.  	 */  	if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || -		     sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { +		     ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {  		ret = -EROFS;  		goto out_writepages;  	} @@ -3307,10 +3309,11 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)  	if (journal) {  		if (jbd2_transaction_committed(journal, -					EXT4_I(inode)->i_datasync_tid)) -			return true; -		return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) >= -			EXT4_I(inode)->i_fc_committed_subtid; +			EXT4_I(inode)->i_datasync_tid)) +			return false; +		if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) +			return !list_empty(&EXT4_I(inode)->i_fc_list); +		return true;  	}  	/* Any metadata buffers to write? */ @@ -4107,7 +4110,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)  		up_write(&EXT4_I(inode)->i_data_sem);  	} -	ext4_fc_track_range(inode, first_block, stop_block); +	ext4_fc_track_range(handle, inode, first_block, stop_block);  	if (IS_SYNC(inode))  		ext4_handle_sync(handle); @@ -5440,14 +5443,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  			}  			if (shrink) -				ext4_fc_track_range(inode, +				ext4_fc_track_range(handle, inode,  					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>  					inode->i_sb->s_blocksize_bits,  					(oldsize > 0 ? oldsize - 1 : 0) >>  					inode->i_sb->s_blocksize_bits);  			else  				ext4_fc_track_range( -					inode, +					handle, inode,  					(oldsize > 0 ? oldsize - 1 : oldsize) >>  					inode->i_sb->s_blocksize_bits,  					(attr->ia_size > 0 ? 
attr->ia_size - 1 : 0) >> @@ -5697,7 +5700,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,  		put_bh(iloc->bh);  		return -EIO;  	} -	ext4_fc_track_inode(inode); +	ext4_fc_track_inode(handle, inode);  	if (IS_I_VERSION(inode))  		inode_inc_iversion(inode); @@ -6157,7 +6160,8 @@ retry_alloc:  			if (ext4_walk_page_buffers(handle, page_buffers(page),  					0, len, NULL, write_end_fn))  				goto out_error; -			if (ext4_jbd2_inode_add_write(handle, inode, 0, len)) +			if (ext4_jbd2_inode_add_write(handle, inode, +						      page_offset(page), len))  				goto out_error;  			ext4_set_inode_state(inode, EXT4_STATE_JDATA);  		} else { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 85abbfb98cbe..24af9ed5c3e5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4477,7 +4477,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)  {  	ext4_group_t i, ngroups; -	if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) +	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))  		return;  	ngroups = ext4_get_groups_count(sb); @@ -4508,7 +4508,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)  {  	struct super_block *sb = ac->ac_sb; -	if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) +	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))  		return;  	mb_debug(sb, "Can't allocate:" @@ -5167,7 +5167,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,  	struct super_block *sb = ar->inode->i_sb;  	ext4_group_t group;  	ext4_grpblk_t blkoff; -	int  i; +	int i = sb->s_blocksize;  	ext4_fsblk_t goal, block;  	struct ext4_super_block *es = EXT4_SB(sb)->s_es; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5159830dacb8..33509266f5a0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1285,8 +1285,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)  int ext4_ci_compare(const struct inode *parent, const struct qstr *name,  		    const struct qstr *entry, bool quick)  { -	const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb); -	const struct unicode_map *um = sbi->s_encoding; +	const struct super_block *sb = parent->i_sb; +	const struct unicode_map *um = sb->s_encoding;  	int ret;  	if (quick) @@ -1298,7 +1298,7 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name,  		/* Handle invalid character sequence as either an error  		 * or as an opaque byte sequence.  		 
*/ -		if (ext4_has_strict_mode(sbi)) +		if (sb_has_strict_encoding(sb))  			return -EINVAL;  		if (name->len != entry->len) @@ -1315,7 +1315,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,  {  	int len; -	if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) { +	if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) {  		cf_name->name = NULL;  		return;  	} @@ -1324,7 +1324,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,  	if (!cf_name->name)  		return; -	len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding, +	len = utf8_casefold(dir->i_sb->s_encoding,  			    iname, cf_name->name,  			    EXT4_NAME_LEN);  	if (len <= 0) { @@ -1361,7 +1361,7 @@ static inline bool ext4_match(const struct inode *parent,  #endif  #ifdef CONFIG_UNICODE -	if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) { +	if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) {  		if (fname->cf_name.name) {  			struct qstr cf = {.name = fname->cf_name.name,  					  .len = fname->cf_name.len}; @@ -2180,9 +2180,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,  	struct buffer_head *bh = NULL;  	struct ext4_dir_entry_2 *de;  	struct super_block *sb; -#ifdef CONFIG_UNICODE -	struct ext4_sb_info *sbi; -#endif  	struct ext4_filename fname;  	int	retval;  	int	dx_fallback=0; @@ -2199,9 +2196,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,  		return -EINVAL;  #ifdef CONFIG_UNICODE -	sbi = EXT4_SB(sb); -	if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && -	    sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) +	if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && +	    sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))  		return -EINVAL;  #endif @@ -2610,7 +2606,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,  		       bool excl)  {  	handle_t *handle; -	struct inode *inode, *inode_save; +	struct inode *inode;  	int err, credits, retries = 0;  	err = dquot_initialize(dir); @@ -2628,11 +2624,9 @@ retry:  		inode->i_op = &ext4_file_inode_operations;  		inode->i_fop = &ext4_file_operations;  		ext4_set_aops(inode); -		inode_save = inode; -		ihold(inode_save);  		err = ext4_add_nondir(handle, dentry, &inode); -		ext4_fc_track_create(inode_save, dentry); -		iput(inode_save); +		if (!err) +			ext4_fc_track_create(handle, dentry);  	}  	if (handle)  		ext4_journal_stop(handle); @@ -2647,7 +2641,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,  		      umode_t mode, dev_t rdev)  {  	handle_t *handle; -	struct inode *inode, *inode_save; +	struct inode *inode;  	int err, credits, retries = 0;  	err = dquot_initialize(dir); @@ -2664,12 +2658,9 @@ retry:  	if (!IS_ERR(inode)) {  		init_special_inode(inode, inode->i_mode, rdev);  		inode->i_op = &ext4_special_inode_operations; -		inode_save = inode; -		ihold(inode_save);  		err = ext4_add_nondir(handle, dentry, &inode);  		if (!err) -			ext4_fc_track_create(inode_save, dentry); -		iput(inode_save); +			ext4_fc_track_create(handle, dentry);  	}  	if (handle)  		ext4_journal_stop(handle); @@ -2833,7 +2824,6 @@ out_clear_inode:  		iput(inode);  		goto out_retry;  	} -	ext4_fc_track_create(inode, dentry);  	ext4_inc_count(dir);  	ext4_update_dx_flag(dir); @@ -2841,6 +2831,7 @@ out_clear_inode:  	if (err)  		goto out_clear_inode;  	d_instantiate_new(dentry, inode); +	ext4_fc_track_create(handle, dentry);  	if (IS_DIRSYNC(dir))  		ext4_handle_sync(handle); @@ -3175,7 +3166,7 @@ 
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)  		goto end_rmdir;  	ext4_dec_count(dir);  	ext4_update_dx_flag(dir); -	ext4_fc_track_unlink(inode, dentry); +	ext4_fc_track_unlink(handle, dentry);  	retval = ext4_mark_inode_dirty(handle, dir);  #ifdef CONFIG_UNICODE @@ -3196,13 +3187,12 @@ end_rmdir:  	return retval;  } -int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,  		  struct inode *inode)  {  	int retval = -ENOENT;  	struct buffer_head *bh;  	struct ext4_dir_entry_2 *de; -	handle_t *handle = NULL;  	int skip_remove_dentry = 0;  	bh = ext4_find_entry(dir, d_name, &de, NULL); @@ -3221,14 +3211,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,  		if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)  			skip_remove_dentry = 1;  		else -			goto out_bh; -	} - -	handle = ext4_journal_start(dir, EXT4_HT_DIR, -				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); -	if (IS_ERR(handle)) { -		retval = PTR_ERR(handle); -		goto out_bh; +			goto out;  	}  	if (IS_DIRSYNC(dir)) @@ -3237,12 +3220,12 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,  	if (!skip_remove_dentry) {  		retval = ext4_delete_entry(handle, dir, de, bh);  		if (retval) -			goto out_handle; +			goto out;  		dir->i_ctime = dir->i_mtime = current_time(dir);  		ext4_update_dx_flag(dir);  		retval = ext4_mark_inode_dirty(handle, dir);  		if (retval) -			goto out_handle; +			goto out;  	} else {  		retval = 0;  	} @@ -3256,15 +3239,14 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,  	inode->i_ctime = current_time(inode);  	retval = ext4_mark_inode_dirty(handle, inode); -out_handle: -	ext4_journal_stop(handle); -out_bh: +out:  	brelse(bh);  	return retval;  }  static int ext4_unlink(struct inode *dir, struct dentry *dentry)  { +	handle_t *handle;  	int retval;  	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3282,9 +3264,16 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)  	if (retval)  		goto out_trace; -	retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry)); +	handle = ext4_journal_start(dir, EXT4_HT_DIR, +				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); +	if (IS_ERR(handle)) { +		retval = PTR_ERR(handle); +		goto out_trace; +	} + +	retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));  	if (!retval) -		ext4_fc_track_unlink(d_inode(dentry), dentry); +		ext4_fc_track_unlink(handle, dentry);  #ifdef CONFIG_UNICODE  	/* VFS negative dentries are incompatible with Encoding and  	 * Case-insensitiveness. 
Eventually we'll want avoid @@ -3295,6 +3284,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)  	if (IS_CASEFOLDED(dir))  		d_invalidate(dentry);  #endif +	if (handle) +		ext4_journal_stop(handle);  out_trace:  	trace_ext4_unlink_exit(dentry, retval); @@ -3451,7 +3442,6 @@ retry:  	err = ext4_add_entry(handle, dentry, inode);  	if (!err) { -		ext4_fc_track_link(inode, dentry);  		err = ext4_mark_inode_dirty(handle, inode);  		/* this can happen only for tmpfile being  		 * linked the first time @@ -3459,6 +3449,7 @@ retry:  		if (inode->i_nlink == 1)  			ext4_orphan_del(handle, inode);  		d_instantiate(dentry, inode); +		ext4_fc_track_link(handle, dentry);  	} else {  		drop_nlink(inode);  		iput(inode); @@ -3919,9 +3910,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,  			EXT4_FC_REASON_RENAME_DIR);  	} else {  		if (new.inode) -			ext4_fc_track_unlink(new.inode, new.dentry); -		ext4_fc_track_link(old.inode, new.dentry); -		ext4_fc_track_unlink(old.inode, old.dentry); +			ext4_fc_track_unlink(handle, new.dentry); +		__ext4_fc_track_link(handle, old.inode, new.dentry); +		__ext4_fc_track_unlink(handle, old.inode, old.dentry);  	}  	if (new.inode) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2fe141ff3c7e..94472044f4c1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -289,18 +289,7 @@ void ext4_superblock_csum_set(struct super_block *sb)  	if (!ext4_has_metadata_csum(sb))  		return; -	/* -	 * Locking the superblock prevents the scenario -	 * where: -	 *  1) a first thread pauses during checksum calculation. -	 *  2) a second thread updates the superblock, recalculates -	 *     the checksum, and updates s_checksum -	 *  3) the first thread resumes and finishes its checksum calculation -	 *     and updates s_checksum with a potentially stale or torn value. 
-	 */ -	lock_buffer(EXT4_SB(sb)->s_sbh);  	es->s_checksum = ext4_superblock_csum(sb, es); -	unlock_buffer(EXT4_SB(sb)->s_sbh);  }  ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -686,7 +675,7 @@ static void ext4_handle_error(struct super_block *sb)  	if (!test_opt(sb, ERRORS_CONT)) {  		journal_t *journal = EXT4_SB(sb)->s_journal; -		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; +		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);  		if (journal)  			jbd2_journal_abort(journal, -EIO);  	} @@ -904,7 +893,7 @@ void __ext4_abort(struct super_block *sb, const char *function,  	va_end(args);  	if (sb_rdonly(sb) == 0) { -		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; +		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);  		if (EXT4_SB(sb)->s_journal)  			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); @@ -1288,7 +1277,7 @@ static void ext4_put_super(struct super_block *sb)  	fs_put_dax(sbi->s_daxdev);  	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);  #ifdef CONFIG_UNICODE -	utf8_unload(sbi->s_encoding); +	utf8_unload(sb->s_encoding);  #endif  	kfree(sbi);  } @@ -1716,11 +1705,10 @@ enum {  	Opt_dioread_nolock, Opt_dioread_lock,  	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,  	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, -	Opt_prefetch_block_bitmaps, Opt_no_fc, +	Opt_prefetch_block_bitmaps,  #ifdef CONFIG_EXT4_DEBUG -	Opt_fc_debug_max_replay, +	Opt_fc_debug_max_replay, Opt_fc_debug_force  #endif -	Opt_fc_debug_force  };  static const match_table_t tokens = { @@ -1807,9 +1795,8 @@ static const match_table_t tokens = {  	{Opt_init_itable, "init_itable=%u"},  	{Opt_init_itable, "init_itable"},  	{Opt_noinit_itable, "noinit_itable"}, -	{Opt_no_fc, "no_fc"}, -	{Opt_fc_debug_force, "fc_debug_force"},  #ifdef CONFIG_EXT4_DEBUG +	{Opt_fc_debug_force, "fc_debug_force"},  	{Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},  #endif  	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, @@ -2027,8 +2014,8 @@ static const struct mount_opts {  	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |  		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),  							MOPT_CLEAR | MOPT_Q}, -	{Opt_usrjquota, 0, MOPT_Q}, -	{Opt_grpjquota, 0, MOPT_Q}, +	{Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, +	{Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},  	{Opt_offusrjquota, 0, MOPT_Q},  	{Opt_offgrpjquota, 0, MOPT_Q},  	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, @@ -2039,11 +2026,9 @@ static const struct mount_opts {  	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},  	{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,  	 MOPT_SET}, -	{Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, -	 MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY}, +#ifdef CONFIG_EXT4_DEBUG  	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,  	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, -#ifdef CONFIG_EXT4_DEBUG  	{Opt_fc_debug_max_replay, 0, MOPT_GTE0},  #endif  	{Opt_err, 0, 0} @@ -2153,7 +2138,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,  		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);  		return 1;  	case Opt_abort: -		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; +		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);  		return 1;  	case Opt_i_version:  		sb->s_flags |= SB_I_VERSION; @@ -2653,10 +2638,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,  	} else if (test_opt2(sb, DAX_INODE)) {  		SEQ_OPTS_PUTS("dax=inode");  	} - -	if (test_opt2(sb, JOURNAL_FAST_COMMIT)) -		SEQ_OPTS_PUTS("fast_commit"); -  	ext4_show_quota_options(seq, sb);  	return 0;  
} @@ -3976,7 +3957,7 @@ int ext4_calculate_overhead(struct super_block *sb)  	 * loaded or not  	 */  	if (sbi->s_journal && !sbi->s_journal_bdev) -		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); +		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);  	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {  		/* j_inum for internal journal is non-zero */  		j_inode = ext4_get_journal_inode(sb, j_inum); @@ -4303,7 +4284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  		goto failed_mount;  #ifdef CONFIG_UNICODE -	if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) { +	if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {  		const struct ext4_sb_encodings *encoding_info;  		struct unicode_map *encoding;  		__u16 encoding_flags; @@ -4334,15 +4315,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  			 "%s-%s with flags 0x%hx", encoding_info->name,  			 encoding_info->version?:"\b", encoding_flags); -		sbi->s_encoding = encoding; -		sbi->s_encoding_flags = encoding_flags; +		sb->s_encoding = encoding; +		sb->s_encoding_flags = encoding_flags;  	}  #endif  	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { -		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); +		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");  		/* can't mount with both data=journal and dioread_nolock. */  		clear_opt(sb, DIOREAD_NOLOCK); +		clear_opt2(sb, JOURNAL_FAST_COMMIT);  		if (test_opt2(sb, EXPLICIT_DELALLOC)) {  			ext4_msg(sb, KERN_ERR, "can't mount with "  				 "both data=journal and delalloc"); @@ -4777,8 +4759,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);  	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);  	sbi->s_fc_bytes = 0; -	sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; -	sbi->s_mount_state &= ~EXT4_FC_COMMITTING; +	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); +	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);  	spin_lock_init(&sbi->s_fc_lock);  	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));  	sbi->s_fc_replay_state.fc_regions = NULL; @@ -4857,6 +4839,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  		goto failed_mount_wq;  	} +	if (test_opt2(sb, JOURNAL_FAST_COMMIT) && +		!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, +					  JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { +		ext4_msg(sb, KERN_ERR, +			"Failed to set fast commit journal feature"); +		goto failed_mount_wq; +	} +  	/* We have now updated the journal if required, so we can  	 * validate the data journaling mode. 
*/  	switch (test_opt(sb, DATA_FLAGS)) { @@ -4975,7 +4965,7 @@ no_journal:  	}  #ifdef CONFIG_UNICODE -	if (sbi->s_encoding) +	if (sb->s_encoding)  		sb->s_d_op = &ext4_dentry_ops;  #endif @@ -5184,7 +5174,7 @@ failed_mount:  		crypto_free_shash(sbi->s_chksum_driver);  #ifdef CONFIG_UNICODE -	utf8_unload(sbi->s_encoding); +	utf8_unload(sb->s_encoding);  #endif  #ifdef CONFIG_QUOTA @@ -5872,7 +5862,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  		goto restore_opts;  	} -	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) +	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))  		ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");  	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5886,7 +5876,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  	}  	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { -		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { +		if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {  			err = -EROFS;  			goto restore_opts;  		} @@ -6560,10 +6550,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,  	brelse(bh);  out:  	if (inode->i_size < off + len) { -		ext4_fc_track_range(inode, -			(inode->i_size > 0 ? inode->i_size - 1 : 0) -				>> inode->i_sb->s_blocksize_bits, -			(off + len) >> inode->i_sb->s_blocksize_bits);  		i_size_write(inode, off + len);  		EXT4_I(inode)->i_disksize = inode->i_size;  		err2 = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 5ff33d18996a..4e27fe6ed3ae 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -315,6 +315,7 @@ EXT4_ATTR_FEATURE(casefold);  EXT4_ATTR_FEATURE(verity);  #endif  EXT4_ATTR_FEATURE(metadata_csum_seed); +EXT4_ATTR_FEATURE(fast_commit);  static struct attribute *ext4_feat_attrs[] = {  	ATTR_LIST(lazy_itable_init), @@ -331,6 +332,7 @@ static struct attribute *ext4_feat_attrs[] = {  	ATTR_LIST(verity),  #endif  	ATTR_LIST(metadata_csum_seed), +	ATTR_LIST(fast_commit),  	NULL,  };  ATTRIBUTE_GROUPS(ext4_feat); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9cd2ecad07db..cc4f987687f3 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -77,7 +77,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,  	if (error)  		return error;  	if (!buffer_mapped(bh_result)) -		return -EIO; +		return -ENODATA;  	return 0;  } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8dff9cbd0a87..62d9081d1e26 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1301,12 +1301,8 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,  	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);  	ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp); -	if (!ret && iomap.type == IOMAP_HOLE) { -		if (create) -			ret = gfs2_iomap_alloc(inode, &iomap, &mp); -		else -			ret = -ENODATA; -	} +	if (create && !ret && iomap.type == IOMAP_HOLE) +		ret = gfs2_iomap_alloc(inode, &iomap, &mp);  	release_metapath(&mp);  	if (ret)  		goto out; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5441c17562c5..d98a2e5dab9f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1078,7 +1078,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,  out_free:  	kfree(gl->gl_lksb.sb_lvbptr);  	kmem_cache_free(cachep, gl); -	atomic_dec(&sdp->sd_glock_disposal); +	if (atomic_dec_and_test(&sdp->sd_glock_disposal)) +		wake_up(&sdp->sd_glock_wait);  out:  	return ret; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index aa3f5236befb..67f2921ae8d4 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -165,6 +165,31 @@ void 
gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)  }  /** + * gfs2_rgrp_metasync - sync out the metadata of a resource group + * @gl: the glock protecting the resource group + * + */ + +static int gfs2_rgrp_metasync(struct gfs2_glock *gl) +{ +	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; +	struct address_space *metamapping = &sdp->sd_aspace; +	struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); +	const unsigned bsize = sdp->sd_sb.sb_bsize; +	loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; +	loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; +	int error; + +	filemap_fdatawrite_range(metamapping, start, end); +	error = filemap_fdatawait_range(metamapping, start, end); +	WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); +	mapping_set_error(metamapping, error); +	if (error) +		gfs2_io_error(sdp); +	return error; +} + +/**   * rgrp_go_sync - sync out the metadata for this glock   * @gl: the glock   * @@ -176,11 +201,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)  static int rgrp_go_sync(struct gfs2_glock *gl)  {  	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; -	struct address_space *mapping = &sdp->sd_aspace;  	struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); -	const unsigned bsize = sdp->sd_sb.sb_bsize; -	loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; -	loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;  	int error;  	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -189,10 +210,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl)  	gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |  		       GFS2_LFC_RGRP_GO_SYNC); -	filemap_fdatawrite_range(mapping, start, end); -	error = filemap_fdatawait_range(mapping, start, end); -	WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); -	mapping_set_error(mapping, error); +	error = gfs2_rgrp_metasync(gl);  	if (!error)  		error = gfs2_ail_empty_gl(gl);  	gfs2_free_clones(rgd); @@ -266,7 +284,24 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip)  }  /** - * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * gfs2_inode_metasync - sync out the metadata of an inode + * @gl: the glock protecting the inode + * + */ +int gfs2_inode_metasync(struct gfs2_glock *gl) +{ +	struct address_space *metamapping = gfs2_glock2aspace(gl); +	int error; + +	filemap_fdatawrite(metamapping); +	error = filemap_fdatawait(metamapping); +	if (error) +		gfs2_io_error(gl->gl_name.ln_sbd); +	return error; +} + +/** + * inode_go_sync - Sync the dirty metadata of an inode   * @gl: the glock protecting the inode   *   */ @@ -297,8 +332,7 @@ static int inode_go_sync(struct gfs2_glock *gl)  		error = filemap_fdatawait(mapping);  		mapping_set_error(mapping, error);  	} -	ret = filemap_fdatawait(metamapping); -	mapping_set_error(metamapping, ret); +	ret = gfs2_inode_metasync(gl);  	if (!error)  		error = ret;  	gfs2_ail_empty_gl(gl); @@ -537,7 +571,18 @@ static int freeze_go_sync(struct gfs2_glock *gl)  	int error = 0;  	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; -	if (gl->gl_req == LM_ST_EXCLUSIVE && !gfs2_withdrawn(sdp)) { +	/* +	 * We need to check gl_state == LM_ST_SHARED here and not gl_req == +	 * LM_ST_EXCLUSIVE. That's because when any node does a freeze, +	 * all the nodes should have the freeze glock in SH mode and they all +	 * call do_xmote: One for EX and the others for UN. They ALL must +	 * freeze locally, and they ALL must queue freeze work. The freeze_work +	 * calls freeze_func, which tries to reacquire the freeze glock in SH, +	 * effectively waiting for the thaw on the node who holds it in EX. 
+	 * Once thawed, the work func acquires the freeze glock in +	 * SH and everybody goes back to thawed. +	 */ +	if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp)) {  		atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);  		error = freeze_super(sdp->sd_vfs);  		if (error) { diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 2dd192e85618..695898afcaf1 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -22,6 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;  extern const struct gfs2_glock_operations gfs2_journal_glops;  extern const struct gfs2_glock_operations *gfs2_glops_list[]; +extern int gfs2_inode_metasync(struct gfs2_glock *gl);  extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);  #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6774865f5b5b..077ccb1b3ccc 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -180,7 +180,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,  		error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);  		if (unlikely(error))  			goto fail; -		gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); +		if (blktype != GFS2_BLKST_UNLINKED) +			gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl);  		glock_set_object(ip->i_iopen_gh.gh_gl, ip);  		gfs2_glock_put(io_gl);  		io_gl = NULL; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9133b3178677..2e9314091c81 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -132,6 +132,8 @@ __acquires(&sdp->sd_ail_lock)  		spin_unlock(&sdp->sd_ail_lock);  		ret = generic_writepages(mapping, wbc);  		spin_lock(&sdp->sd_ail_lock); +		if (ret == -ENODATA) /* if a jdata write into a new hole */ +			ret = 0; /* ignore it */  		if (ret || wbc->nr_to_write <= 0)  			break;  		return -EBUSY; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed69298dd824..3922b26264f5 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -22,6 +22,7 @@  #include "incore.h"  #include "inode.h"  #include "glock.h" +#include "glops.h"  #include "log.h"  #include "lops.h"  #include "meta_io.h" @@ -817,41 +818,19 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,  	return error;  } -/** - * gfs2_meta_sync - Sync all buffers associated with a glock - * @gl: The glock - * - */ - -void gfs2_meta_sync(struct gfs2_glock *gl) -{ -	struct address_space *mapping = gfs2_glock2aspace(gl); -	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; -	int error; - -	if (mapping == NULL) -		mapping = &sdp->sd_aspace; - -	filemap_fdatawrite(mapping); -	error = filemap_fdatawait(mapping); - -	if (error) -		gfs2_io_error(gl->gl_name.ln_sbd); -} -  static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)  {  	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);  	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);  	if (error) { -		gfs2_meta_sync(ip->i_gl); +		gfs2_inode_metasync(ip->i_gl);  		return;  	}  	if (pass != 1)  		return; -	gfs2_meta_sync(ip->i_gl); +	gfs2_inode_metasync(ip->i_gl);  	fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",  	        jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); @@ -1060,14 +1039,14 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)  	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);  	if (error) { -		gfs2_meta_sync(ip->i_gl); +		gfs2_inode_metasync(ip->i_gl);  		return;  	}  	if (pass != 1)  		return;  	/* data sync? 
*/ -	gfs2_meta_sync(ip->i_gl); +	gfs2_inode_metasync(ip->i_gl);  	fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",  		jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 4a3d8aecdf82..fbdbb08dcec6 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,8 +27,6 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf);  extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);  extern int gfs2_find_jhead(struct gfs2_jdesc *jd,  			   struct gfs2_log_header_host *head, bool keep_cache); -extern void gfs2_meta_sync(struct gfs2_glock *gl); -  static inline unsigned int buf_limit(struct gfs2_sbd *sdp)  {  	unsigned int limit; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7a7e3c10a9a9..61fce59cb4d3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -633,8 +633,10 @@ static int init_statfs(struct gfs2_sbd *sdp)  	if (IS_ERR(sdp->sd_statfs_inode)) {  		error = PTR_ERR(sdp->sd_statfs_inode);  		fs_err(sdp, "can't read in statfs inode: %d\n", error); -		goto fail; +		goto out;  	} +	if (sdp->sd_args.ar_spectator) +		goto out;  	pn = gfs2_lookup_simple(master, "per_node");  	if (IS_ERR(pn)) { @@ -682,15 +684,17 @@ free_local:  	iput(pn);  put_statfs:  	iput(sdp->sd_statfs_inode); -fail: +out:  	return error;  }  /* Uninitialize and free up memory used by the list of statfs inodes */  static void uninit_statfs(struct gfs2_sbd *sdp)  { -	gfs2_glock_dq_uninit(&sdp->sd_sc_gh); -	free_local_statfs_inodes(sdp); +	if (!sdp->sd_args.ar_spectator) { +		gfs2_glock_dq_uninit(&sdp->sd_sc_gh); +		free_local_statfs_inodes(sdp); +	}  	iput(sdp->sd_statfs_inode);  } @@ -704,7 +708,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)  	if (undo) {  		jindex = 0; -		goto fail_jinode_gh; +		goto fail_statfs;  	}  	sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b5cbe21efdfb..c26c68ebd29d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -349,7 +349,7 @@ static int update_statfs_inode(struct gfs2_jdesc *jd,  	mark_buffer_dirty(bh);  	brelse(bh); -	gfs2_meta_sync(ip->i_gl); +	gfs2_inode_metasync(ip->i_gl);  out:  	return error; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ee491bb9c1cc..f7addc6197ed 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -719,9 +719,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)  		}  		gfs2_free_clones(rgd); +		return_all_reservations(rgd);  		kfree(rgd->rd_bits);  		rgd->rd_bits = NULL; -		return_all_reservations(rgd);  		kmem_cache_free(gfs2_rgrpd_cachep, rgd);  	}  } @@ -1370,6 +1370,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; +	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) +		return -EROFS; +  	if (!blk_queue_discard(q))  		return -EOPNOTSUPP; @@ -2526,13 +2529,13 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)  	rbm.rgd = rgd;  	error = gfs2_rbm_from_block(&rbm, no_addr); -	if (WARN_ON_ONCE(error)) -		goto fail; - -	if (gfs2_testbit(&rbm, false) != type) -		error = -ESTALE; +	if (!WARN_ON_ONCE(error)) { +		if (gfs2_testbit(&rbm, false) != type) +			error = -ESTALE; +	}  	gfs2_glock_dq_uninit(&rgd_gh); +  fail:  	return error;  } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b285192bd6b3..b3d951ab8068 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -738,6 +738,7 @@ restart:  	gfs2_jindex_free(sdp);  	/*  Take apart glock structures and buffer lists  */  	gfs2_gl_hash_clear(sdp); 
+	truncate_inode_pages_final(&sdp->sd_aspace);  	gfs2_delete_debugfs_file(sdp);  	/*  Unmount the locking protocol  */  	gfs2_lm_unmount(sdp); diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index dcc2aab1b2c4..4ba45caf5939 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -60,7 +60,7 @@ struct hfs_bnode {  	wait_queue_head_t lock_wq;  	atomic_t refcnt;  	unsigned int page_offset; -	struct page *page[0]; +	struct page *page[];  };  #define HFS_BNODE_ERROR		0 diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 3b03fff68543..a92de5199ec3 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -117,7 +117,7 @@ struct hfs_bnode {  	wait_queue_head_t lock_wq;  	atomic_t refcnt;  	unsigned int page_offset; -	struct page *page[0]; +	struct page *page[];  };  #define HFS_BNODE_LOCK		0 diff --git a/fs/io-wq.c b/fs/io-wq.c index 02894df7656d..b53c055bea6a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -482,6 +482,10 @@ static void io_impersonate_work(struct io_worker *worker,  		current->files = work->identity->files;  		current->nsproxy = work->identity->nsproxy;  		task_unlock(current); +		if (!work->identity->files) { +			/* failed grabbing files, ensure work gets cancelled */ +			work->flags |= IO_WQ_WORK_CANCEL; +		}  	}  	if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)  		current->fs = work->identity->fs; diff --git a/fs/io_uring.c b/fs/io_uring.c index b42dfa0243bf..1023f7b44cea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -205,6 +205,7 @@ struct fixed_file_ref_node {  	struct list_head		file_list;  	struct fixed_file_data		*file_data;  	struct llist_node		llist; +	bool				done;  };  struct fixed_file_data { @@ -478,6 +479,7 @@ struct io_sr_msg {  struct io_open {  	struct file			*file;  	int				dfd; +	bool				ignore_nonblock;  	struct filename			*filename;  	struct open_how			how;  	unsigned long			nofile; @@ -995,20 +997,33 @@ static void io_sq_thread_drop_mm(void)  	if (mm) {  		kthread_unuse_mm(mm);  		mmput(mm); +		current->mm = NULL;  	}  }  static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)  { -	if (!current->mm) { -		if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || -			     !ctx->sqo_task->mm || -			     !mmget_not_zero(ctx->sqo_task->mm))) -			return -EFAULT; -		kthread_use_mm(ctx->sqo_task->mm); +	struct mm_struct *mm; + +	if (current->mm) +		return 0; + +	/* Should never happen */ +	if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL))) +		return -EFAULT; + +	task_lock(ctx->sqo_task); +	mm = ctx->sqo_task->mm; +	if (unlikely(!mm || !mmget_not_zero(mm))) +		mm = NULL; +	task_unlock(ctx->sqo_task); + +	if (mm) { +		kthread_use_mm(mm); +		return 0;  	} -	return 0; +	return -EFAULT;  }  static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, @@ -1274,9 +1289,12 @@ static bool io_identity_cow(struct io_kiocb *req)  	/* add one for this request */  	refcount_inc(&id->count); -	/* drop old identity, assign new one. 
one ref for req, one for tctx */ -	if (req->work.identity != tctx->identity && -	    refcount_sub_and_test(2, &req->work.identity->count)) +	/* drop tctx and req identity references, if needed */ +	if (tctx->identity != &tctx->__identity && +	    refcount_dec_and_test(&tctx->identity->count)) +		kfree(tctx->identity); +	if (req->work.identity != &tctx->__identity && +	    refcount_dec_and_test(&req->work.identity->count))  		kfree(req->work.identity);  	req->work.identity = id; @@ -1295,22 +1313,6 @@ static bool io_grab_identity(struct io_kiocb *req)  			return false;  		req->work.flags |= IO_WQ_WORK_FSIZE;  	} - -	if (!(req->work.flags & IO_WQ_WORK_FILES) && -	    (def->work_flags & IO_WQ_WORK_FILES) && -	    !(req->flags & REQ_F_NO_FILE_TABLE)) { -		if (id->files != current->files || -		    id->nsproxy != current->nsproxy) -			return false; -		atomic_inc(&id->files->count); -		get_nsproxy(id->nsproxy); -		req->flags |= REQ_F_INFLIGHT; - -		spin_lock_irq(&ctx->inflight_lock); -		list_add(&req->inflight_entry, &ctx->inflight_list); -		spin_unlock_irq(&ctx->inflight_lock); -		req->work.flags |= IO_WQ_WORK_FILES; -	}  #ifdef CONFIG_BLK_CGROUP  	if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&  	    (def->work_flags & IO_WQ_WORK_BLKCG)) { @@ -1352,6 +1354,21 @@ static bool io_grab_identity(struct io_kiocb *req)  		}  		spin_unlock(¤t->fs->lock);  	} +	if (!(req->work.flags & IO_WQ_WORK_FILES) && +	    (def->work_flags & IO_WQ_WORK_FILES) && +	    !(req->flags & REQ_F_NO_FILE_TABLE)) { +		if (id->files != current->files || +		    id->nsproxy != current->nsproxy) +			return false; +		atomic_inc(&id->files->count); +		get_nsproxy(id->nsproxy); +		req->flags |= REQ_F_INFLIGHT; + +		spin_lock_irq(&ctx->inflight_lock); +		list_add(&req->inflight_entry, &ctx->inflight_list); +		spin_unlock_irq(&ctx->inflight_lock); +		req->work.flags |= IO_WQ_WORK_FILES; +	}  	return true;  } @@ -1365,6 +1382,9 @@ static void io_prep_async_work(struct io_kiocb *req)  	io_req_init_async(req);  	id = req->work.identity; +	if (req->flags & REQ_F_FORCE_ASYNC) +		req->work.flags |= IO_WQ_WORK_CONCURRENT; +  	if (req->flags & REQ_F_ISREG) {  		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))  			io_wq_hash_work(&req->work, file_inode(req->file)); @@ -1574,14 +1594,29 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)  	}  } -static inline bool io_match_files(struct io_kiocb *req, -				       struct files_struct *files) +static inline bool __io_match_files(struct io_kiocb *req, +				    struct files_struct *files) +{ +	return ((req->flags & REQ_F_WORK_INITIALIZED) && +	        (req->work.flags & IO_WQ_WORK_FILES)) && +		req->work.identity->files == files; +} + +static bool io_match_files(struct io_kiocb *req, +			   struct files_struct *files)  { +	struct io_kiocb *link; +  	if (!files)  		return true; -	if ((req->flags & REQ_F_WORK_INITIALIZED) && -	    (req->work.flags & IO_WQ_WORK_FILES)) -		return req->work.identity->files == files; +	if (__io_match_files(req, files)) +		return true; +	if (req->flags & REQ_F_LINK_HEAD) { +		list_for_each_entry(link, &req->link_list, link_list) { +			if (__io_match_files(link, files)) +				return true; +		} +	}  	return false;  } @@ -1665,7 +1700,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)  		WRITE_ONCE(cqe->user_data, req->user_data);  		WRITE_ONCE(cqe->res, res);  		WRITE_ONCE(cqe->flags, cflags); -	} else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { +	} else if (ctx->cq_overflow_flushed || +		   
atomic_read(&req->task->io_uring->in_idle)) {  		/*  		 * If we're in ring overflow flush mode, or in task cancel mode,  		 * then we cannot store the request for later flushing, we need @@ -1835,7 +1871,7 @@ static void __io_free_req(struct io_kiocb *req)  	io_dismantle_req(req);  	percpu_counter_dec(&tctx->inflight); -	if (tctx->in_idle) +	if (atomic_read(&tctx->in_idle))  		wake_up(&tctx->wait);  	put_task_struct(req->task); @@ -1846,59 +1882,39 @@ static void __io_free_req(struct io_kiocb *req)  	percpu_ref_put(&ctx->refs);  } -static bool io_link_cancel_timeout(struct io_kiocb *req) +static void io_kill_linked_timeout(struct io_kiocb *req)  { -	struct io_timeout_data *io = req->async_data;  	struct io_ring_ctx *ctx = req->ctx; -	int ret; - -	ret = hrtimer_try_to_cancel(&io->timer); -	if (ret != -1) { -		io_cqring_fill_event(req, -ECANCELED); -		io_commit_cqring(ctx); -		req->flags &= ~REQ_F_LINK_HEAD; -		io_put_req_deferred(req, 1); -		return true; -	} - -	return false; -} - -static bool __io_kill_linked_timeout(struct io_kiocb *req) -{  	struct io_kiocb *link; -	bool wake_ev; +	bool cancelled = false; +	unsigned long flags; -	if (list_empty(&req->link_list)) -		return false; -	link = list_first_entry(&req->link_list, struct io_kiocb, link_list); -	if (link->opcode != IORING_OP_LINK_TIMEOUT) -		return false; +	spin_lock_irqsave(&ctx->completion_lock, flags); +	link = list_first_entry_or_null(&req->link_list, struct io_kiocb, +					link_list);  	/*  	 * Can happen if a linked timeout fired and link had been like  	 * req -> link t-out -> link t-out [-> ...]  	 */ -	if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE)) -		return false; +	if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { +		struct io_timeout_data *io = link->async_data; +		int ret; -	list_del_init(&link->link_list); -	wake_ev = io_link_cancel_timeout(link); +		list_del_init(&link->link_list); +		ret = hrtimer_try_to_cancel(&io->timer); +		if (ret != -1) { +			io_cqring_fill_event(link, -ECANCELED); +			io_commit_cqring(ctx); +			cancelled = true; +		} +	}  	req->flags &= ~REQ_F_LINK_TIMEOUT; -	return wake_ev; -} - -static void io_kill_linked_timeout(struct io_kiocb *req) -{ -	struct io_ring_ctx *ctx = req->ctx; -	unsigned long flags; -	bool wake_ev; - -	spin_lock_irqsave(&ctx->completion_lock, flags); -	wake_ev = __io_kill_linked_timeout(req);  	spin_unlock_irqrestore(&ctx->completion_lock, flags); -	if (wake_ev) +	if (cancelled) {  		io_cqring_ev_posted(ctx); +		io_put_req(link); +	}  }  static struct io_kiocb *io_req_link_next(struct io_kiocb *req) @@ -2562,7 +2578,6 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)  	}  end_req:  	req_set_fail_links(req); -	io_req_complete(req, ret);  	return false;  }  #endif @@ -3177,7 +3192,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,  	rw->free_iovec = iovec;  	rw->bytes_done = 0;  	/* can only be fixed buffers, no need to do anything */ -	if (iter->type == ITER_BVEC) +	if (iov_iter_is_bvec(iter))  		return;  	if (!iovec) {  		unsigned iov_off = 0; @@ -3532,8 +3547,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,  	 * we return to userspace.  	 
*/  	if (req->flags & REQ_F_ISREG) { -		__sb_start_write(file_inode(req->file)->i_sb, -					SB_FREEZE_WRITE, true); +		sb_start_write(file_inode(req->file)->i_sb);  		__sb_writers_release(file_inode(req->file)->i_sb,  					SB_FREEZE_WRITE);  	} @@ -3781,6 +3795,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe  		return ret;  	}  	req->open.nofile = rlimit(RLIMIT_NOFILE); +	req->open.ignore_nonblock = false;  	req->flags |= REQ_F_NEED_CLEANUP;  	return 0;  } @@ -3824,7 +3839,7 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)  	struct file *file;  	int ret; -	if (force_nonblock) +	if (force_nonblock && !req->open.ignore_nonblock)  		return -EAGAIN;  	ret = build_open_flags(&req->open.how, &op); @@ -3839,6 +3854,21 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)  	if (IS_ERR(file)) {  		put_unused_fd(ret);  		ret = PTR_ERR(file); +		/* +		 * A work-around to ensure that /proc/self works the way +		 * that it should - if we get -EOPNOTSUPP back, then assume +		 * that proc_self_get_link() failed us because we're in async +		 * context. We should be safe to retry this from the task +		 * itself with force_nonblock == false set, as it should not +		 * block on lookup. Would be nice to know this upfront and +		 * avoid the async dance, but doesn't seem feasible. +		 */ +		if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) { +			req->open.ignore_nonblock = true; +			refcount_inc(&req->refs); +			io_req_task_queue(req); +			return 0; +		}  	} else {  		fsnotify_open(file);  		fd_install(ret, file); @@ -4977,8 +5007,10 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,  		/* make sure double remove sees this as being gone */  		wait->private = NULL;  		spin_unlock(&poll->head->lock); -		if (!done) -			__io_async_wake(req, poll, mask, io_poll_task_func); +		if (!done) { +			/* use wait func handler, so it matches the rq type */ +			poll->wait.func(&poll->wait, mode, sync, key); +		}  	}  	refcount_dec(&req->refs);  	return 1; @@ -6180,7 +6212,6 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)  static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)  {  	struct io_kiocb *linked_timeout; -	struct io_kiocb *nxt;  	const struct cred *old_creds = NULL;  	int ret; @@ -6206,7 +6237,6 @@ again:  	 */  	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {  		if (!io_arm_poll_handler(req)) { -punt:  			/*  			 * Queued up for async execution, worker will release  			 * submit reference when the iocb is actually submitted. 
@@ -6216,33 +6246,25 @@ punt:  		if (linked_timeout)  			io_queue_linked_timeout(linked_timeout); -		goto exit; -	} +	} else if (likely(!ret)) { +		/* drop submission reference */ +		req = io_put_req_find_next(req); +		if (linked_timeout) +			io_queue_linked_timeout(linked_timeout); -	if (unlikely(ret)) { +		if (req) { +			if (!(req->flags & REQ_F_FORCE_ASYNC)) +				goto again; +			io_queue_async_work(req); +		} +	} else {  		/* un-prep timeout, so it'll be killed as any other linked */  		req->flags &= ~REQ_F_LINK_TIMEOUT;  		req_set_fail_links(req);  		io_put_req(req);  		io_req_complete(req, ret); -		goto exit;  	} -	/* drop submission reference */ -	nxt = io_put_req_find_next(req); -	if (linked_timeout) -		io_queue_linked_timeout(linked_timeout); - -	if (nxt) { -		req = nxt; - -		if (req->flags & REQ_F_FORCE_ASYNC) { -			linked_timeout = NULL; -			goto punt; -		} -		goto again; -	} -exit:  	if (old_creds)  		revert_creds(old_creds);  } @@ -6266,13 +6288,6 @@ fail_req:  			if (unlikely(ret))  				goto fail_req;  		} - -		/* -		 * Never try inline submit of IOSQE_ASYNC is set, go straight -		 * to async execution. -		 */ -		io_req_init_async(req); -		req->work.flags |= IO_WQ_WORK_CONCURRENT;  		io_queue_async_work(req);  	} else {  		if (sqe) { @@ -6958,9 +6973,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)  		return -ENXIO;  	spin_lock(&data->lock); -	if (!list_empty(&data->ref_list)) -		ref_node = list_first_entry(&data->ref_list, -				struct fixed_file_ref_node, node); +	ref_node = data->node;  	spin_unlock(&data->lock);  	if (ref_node)  		percpu_ref_kill(&ref_node->refs); @@ -7309,10 +7322,6 @@ static void __io_file_put_work(struct fixed_file_ref_node *ref_node)  		kfree(pfile);  	} -	spin_lock(&file_data->lock); -	list_del(&ref_node->node); -	spin_unlock(&file_data->lock); -  	percpu_ref_exit(&ref_node->refs);  	kfree(ref_node);  	percpu_ref_put(&file_data->refs); @@ -7339,17 +7348,32 @@ static void io_file_put_work(struct work_struct *work)  static void io_file_data_ref_zero(struct percpu_ref *ref)  {  	struct fixed_file_ref_node *ref_node; +	struct fixed_file_data *data;  	struct io_ring_ctx *ctx; -	bool first_add; +	bool first_add = false;  	int delay = HZ;  	ref_node = container_of(ref, struct fixed_file_ref_node, refs); -	ctx = ref_node->file_data->ctx; +	data = ref_node->file_data; +	ctx = data->ctx; + +	spin_lock(&data->lock); +	ref_node->done = true; -	if (percpu_ref_is_dying(&ctx->file_data->refs)) +	while (!list_empty(&data->ref_list)) { +		ref_node = list_first_entry(&data->ref_list, +					struct fixed_file_ref_node, node); +		/* recycle ref nodes in order */ +		if (!ref_node->done) +			break; +		list_del(&ref_node->node); +		first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); +	} +	spin_unlock(&data->lock); + +	if (percpu_ref_is_dying(&data->refs))  		delay = 0; -	first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);  	if (!delay)  		mod_delayed_work(system_wq, &ctx->file_put_work, 0);  	else if (first_add) @@ -7373,6 +7397,7 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(  	INIT_LIST_HEAD(&ref_node->node);  	INIT_LIST_HEAD(&ref_node->file_list);  	ref_node->file_data = ctx->file_data; +	ref_node->done = false;  	return ref_node;  } @@ -7468,7 +7493,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,  	file_data->node = ref_node;  	spin_lock(&file_data->lock); -	list_add(&ref_node->node, &file_data->ref_list); +	list_add_tail(&ref_node->node, &file_data->ref_list);  	
spin_unlock(&file_data->lock);  	percpu_ref_get(&file_data->refs);  	return ret; @@ -7627,7 +7652,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,  	if (needs_switch) {  		percpu_ref_kill(&data->node->refs);  		spin_lock(&data->lock); -		list_add(&ref_node->node, &data->ref_list); +		list_add_tail(&ref_node->node, &data->ref_list);  		data->node = ref_node;  		spin_unlock(&data->lock);  		percpu_ref_get(&ctx->file_data->refs); @@ -7727,7 +7752,8 @@ static int io_uring_alloc_task_context(struct task_struct *task)  	xa_init(&tctx->xa);  	init_waitqueue_head(&tctx->wait);  	tctx->last = NULL; -	tctx->in_idle = 0; +	atomic_set(&tctx->in_idle, 0); +	tctx->sqpoll = false;  	io_init_identity(&tctx->__identity);  	tctx->identity = &tctx->__identity;  	task->io_uring = tctx; @@ -8420,22 +8446,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)  	return false;  } -static bool io_match_link_files(struct io_kiocb *req, -				struct files_struct *files) -{ -	struct io_kiocb *link; - -	if (io_match_files(req, files)) -		return true; -	if (req->flags & REQ_F_LINK_HEAD) { -		list_for_each_entry(link, &req->link_list, link_list) { -			if (io_match_files(link, files)) -				return true; -		} -	} -	return false; -} -  /*   * We're looking to cancel 'req' because it's holding on to our files, but   * 'req' could be a link to another request. See if it is, and cancel that @@ -8485,7 +8495,21 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx,  static bool io_cancel_link_cb(struct io_wq_work *work, void *data)  { -	return io_match_link(container_of(work, struct io_kiocb, work), data); +	struct io_kiocb *req = container_of(work, struct io_kiocb, work); +	bool ret; + +	if (req->flags & REQ_F_LINK_TIMEOUT) { +		unsigned long flags; +		struct io_ring_ctx *ctx = req->ctx; + +		/* protect against races with linked timeouts */ +		spin_lock_irqsave(&ctx->completion_lock, flags); +		ret = io_match_link(req, data); +		spin_unlock_irqrestore(&ctx->completion_lock, flags); +	} else { +		ret = io_match_link(req, data); +	} +	return ret;  }  static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -8511,6 +8535,7 @@ static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)  }  static void io_cancel_defer_files(struct io_ring_ctx *ctx, +				  struct task_struct *task,  				  struct files_struct *files)  {  	struct io_defer_entry *de = NULL; @@ -8518,7 +8543,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,  	spin_lock_irq(&ctx->completion_lock);  	list_for_each_entry_reverse(de, &ctx->defer_list, list) { -		if (io_match_link_files(de->req, files)) { +		if (io_task_match(de->req, task) && +		    io_match_files(de->req, files)) {  			list_cut_position(&list, &ctx->defer_list, &de->list);  			break;  		} @@ -8544,7 +8570,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx,  	if (list_empty_careful(&ctx->inflight_list))  		return false; -	io_cancel_defer_files(ctx, files);  	/* cancel all at once, should be faster than doing it one by one*/  	io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); @@ -8630,8 +8655,16 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,  {  	struct task_struct *task = current; -	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) +	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {  		task = ctx->sq_data->thread; +		atomic_inc(&task->io_uring->in_idle); +		io_sq_thread_park(ctx->sq_data); +	} + +	if (files) +		io_cancel_defer_files(ctx, NULL, files); +	
else +		io_cancel_defer_files(ctx, task, NULL);  	io_cqring_overflow_flush(ctx, true, task, files); @@ -8639,12 +8672,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,  		io_run_task_work();  		cond_resched();  	} + +	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { +		atomic_dec(&task->io_uring->in_idle); +		/* +		 * If the files that are going away are the ones in the thread +		 * identity, clear them out. +		 */ +		if (task->io_uring->identity->files == files) +			task->io_uring->identity->files = NULL; +		io_sq_thread_unpark(ctx->sq_data); +	}  }  /*   * Note that this task has used io_uring. We use it for cancelation purposes.   */ -static int io_uring_add_task_file(struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)  {  	struct io_uring_task *tctx = current->io_uring; @@ -8666,6 +8710,14 @@ static int io_uring_add_task_file(struct file *file)  		tctx->last = file;  	} +	/* +	 * This is race safe in that the task itself is doing this, hence it +	 * cannot be going through the exit/cancel paths at the same time. +	 * This cannot be modified while exit/cancel is running. +	 */ +	if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) +		tctx->sqpoll = true; +  	return 0;  } @@ -8707,7 +8759,7 @@ void __io_uring_files_cancel(struct files_struct *files)  	unsigned long index;  	/* make sure overflow events are dropped */ -	tctx->in_idle = true; +	atomic_inc(&tctx->in_idle);  	xa_for_each(&tctx->xa, index, file) {  		struct io_ring_ctx *ctx = file->private_data; @@ -8716,6 +8768,35 @@ void __io_uring_files_cancel(struct files_struct *files)  		if (files)  			io_uring_del_task_file(file);  	} + +	atomic_dec(&tctx->in_idle); +} + +static s64 tctx_inflight(struct io_uring_task *tctx) +{ +	unsigned long index; +	struct file *file; +	s64 inflight; + +	inflight = percpu_counter_sum(&tctx->inflight); +	if (!tctx->sqpoll) +		return inflight; + +	/* +	 * If we have SQPOLL rings, then we need to iterate and find them, and +	 * add the pending count for those. +	 */ +	xa_for_each(&tctx->xa, index, file) { +		struct io_ring_ctx *ctx = file->private_data; + +		if (ctx->flags & IORING_SETUP_SQPOLL) { +			struct io_uring_task *__tctx = ctx->sqo_task->io_uring; + +			inflight += percpu_counter_sum(&__tctx->inflight); +		} +	} + +	return inflight;  }  /* @@ -8729,11 +8810,11 @@ void __io_uring_task_cancel(void)  	s64 inflight;  	/* make sure overflow events are dropped */ -	tctx->in_idle = true; +	atomic_inc(&tctx->in_idle);  	do {  		/* read completions before cancelations */ -		inflight = percpu_counter_sum(&tctx->inflight); +		inflight = tctx_inflight(tctx);  		if (!inflight)  			break;  		__io_uring_files_cancel(NULL); @@ -8744,13 +8825,13 @@ void __io_uring_task_cancel(void)  		 * If we've seen completions, retry. This avoids a race where  		 * a completion comes in before we did prepare_to_wait().  		 
*/ -		if (inflight != percpu_counter_sum(&tctx->inflight)) +		if (inflight != tctx_inflight(tctx))  			continue;  		schedule();  	} while (1);  	finish_wait(&tctx->wait, &wait); -	tctx->in_idle = false; +	atomic_dec(&tctx->in_idle);  }  static int io_uring_flush(struct file *file, void *data) @@ -8895,7 +8976,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,  			io_sqpoll_wait_sq(ctx);  		submitted = to_submit;  	} else if (to_submit) { -		ret = io_uring_add_task_file(f.file); +		ret = io_uring_add_task_file(ctx, f.file);  		if (unlikely(ret))  			goto out;  		mutex_lock(&ctx->uring_lock); @@ -8932,7 +9013,8 @@ out_fput:  #ifdef CONFIG_PROC_FS  static int io_uring_show_cred(int id, void *p, void *data)  { -	const struct cred *cred = p; +	struct io_identity *iod = p; +	const struct cred *cred = iod->creds;  	struct seq_file *m = data;  	struct user_namespace *uns = seq_user_ns(m);  	struct group_info *gi; @@ -9124,7 +9206,7 @@ err_fd:  #if defined(CONFIG_UNIX)  	ctx->ring_sock->file = file;  #endif -	if (unlikely(io_uring_add_task_file(file))) { +	if (unlikely(io_uring_add_task_file(ctx, file))) {  		file = ERR_PTR(-ENOMEM);  		goto err_fd;  	} @@ -9169,7 +9251,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,  		 * to a power-of-two, if it isn't already. We do NOT impose  		 * any cq vs sq ring sizing.  		 */ -		if (p->cq_entries < p->sq_entries) +		if (!p->cq_entries)  			return -EINVAL;  		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {  			if (!(p->flags & IORING_SETUP_CLAMP))  				return -EINVAL;  			p->cq_entries = IORING_MAX_CQ_ENTRIES;  		}  		p->cq_entries = roundup_pow_of_two(p->cq_entries); +		if (p->cq_entries < p->sq_entries) +			return -EINVAL;  	} else {  		p->cq_entries = 2 * p->sq_entries;  	} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8180061b9e16..10cc7979ce38 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1374,6 +1374,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,  	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));  	WARN_ON_ONCE(!PageLocked(page));  	WARN_ON_ONCE(PageWriteback(page)); +	WARN_ON_ONCE(PageDirty(page));  	/*  	 * We cannot cancel the ioend directly here on error.  We may have @@ -1382,33 +1383,22 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,  	 * appropriately.  	 */  	if (unlikely(error)) { +		/* +		 * Let the filesystem know what portion of the current page +		 * failed to map. If the page hasn't been added to ioend, it +		 * won't be affected by I/O completion and we must unlock it +		 * now. +		 */ +		if (wpc->ops->discard_page) +			wpc->ops->discard_page(page, file_offset);  		if (!count) { -			/* -			 * If the current page hasn't been added to ioend, it -			 * won't be affected by I/O completions and we must -			 * discard and unlock it right here. -			 */ -			if (wpc->ops->discard_page) -				wpc->ops->discard_page(page);  			ClearPageUptodate(page);  			unlock_page(page);  			goto done;  		} - -		/* -		 * If the page was not fully cleaned, we need to ensure that the -		 * higher layers come back to it correctly.  That means we need -		 * to keep the page dirty, and for WB_SYNC_ALL writeback we need -		 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed -		 * so another attempt to write this page in this writeback sweep -		 * will be made. 
-		 */ -		set_page_writeback_keepwrite(page); -	} else { -		clear_page_dirty_for_io(page); -		set_page_writeback(page);  	} +	set_page_writeback(page);  	unlock_page(page);  	/* diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index 1558cf22ef8a..ee9660e9671c 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -22,7 +22,7 @@ struct SU_ER_s {  	__u8 len_des;  	__u8 len_src;  	__u8 ext_ver; -	__u8 data[0]; +	__u8 data[]; } __attribute__ ((packed));  struct RR_RR_s { @@ -44,7 +44,7 @@ struct RR_PN_s {  struct SL_component {  	__u8 flags;  	__u8 len; -	__u8 text[0]; +	__u8 text[]; } __attribute__ ((packed));  struct RR_SL_s { @@ -54,7 +54,7 @@ struct RR_SL_s {  struct RR_NM_s {  	__u8 flags; -	char name[0]; +	char name[]; } __attribute__ ((packed));  struct RR_CL_s { @@ -71,7 +71,7 @@ struct stamp {  struct RR_TF_s {  	__u8 flags; -	struct stamp times[0];	/* Variable number of these beasts */ +	struct stamp times[];	/* Variable number of these beasts */ } __attribute__ ((packed));  /* Linux-specific extension for transparent decompression */ diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 263f02ad8ebf..472932b9e6bc 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -106,6 +106,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)   * for a checkpoint to free up some space in the log.   */  void __jbd2_log_wait_for_space(journal_t *journal) +__acquires(&journal->j_state_lock) +__releases(&journal->j_state_lock)  {  	int nblocks, space_left;  	/* assert_spin_locked(&journal->j_state_lock); */ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fa688e163a80..b121d7d434c6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -450,6 +450,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)  		schedule();  		write_lock(&journal->j_state_lock);  		finish_wait(&journal->j_fc_wait, &wait); +		/* +		 * TODO: by blocking fast commits here, we are increasing +		 * fsync() latency slightly. Strictly speaking, we don't need +		 * to block fast commits until the transaction enters T_FLUSH +		 * state. So an optimization is possible where we block new fast +		 * commits here and wait for existing ones to complete +		 * just before we enter T_FLUSH. That way, the existing fast +		 * commits and this full commit can proceed in parallel. +		 */  	}  	write_unlock(&journal->j_state_lock); @@ -801,7 +810,7 @@ start_journal_io:  		if (first_block < journal->j_tail)  			freed += journal->j_last - journal->j_first;  		/* Update tail only if we free significant amount of space */ -		if (freed < journal->j_maxlen / 4) +		if (freed < jbd2_journal_get_max_txn_bufs(journal))  			update_tail = 0;  	}  	J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c7c42bd530f..188f79d76988 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -566,12 +566,14 @@ static int __jbd2_journal_force_commit(journal_t *journal)  }  /** - * Force and wait upon a commit if the calling process is not within - * transaction.  This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space. + * jbd2_journal_force_commit_nested - Force and wait upon a commit if the + * calling process is not within transaction.   *   * @journal: journal to force   * Returns true if progress was made. + * + * This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space.   
*/  int jbd2_journal_force_commit_nested(journal_t *journal)  { @@ -582,7 +584,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)  }  /** - * int journal_force_commit() - force any uncommitted transactions + * jbd2_journal_force_commit() - force any uncommitted transactions   * @journal: journal to force   *   * Caller want unconditional commit. We can only force the running transaction @@ -727,6 +729,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)   */  int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)  { +	if (unlikely(is_journal_aborted(journal))) +		return -EIO;  	/*  	 * Fast commits only allowed if at least one full commit has  	 * been processed. @@ -734,10 +738,12 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)  	if (!journal->j_stats.ts_tid)  		return -EINVAL; -	if (tid <= journal->j_commit_sequence) +	write_lock(&journal->j_state_lock); +	if (tid <= journal->j_commit_sequence) { +		write_unlock(&journal->j_state_lock);  		return -EALREADY; +	} -	write_lock(&journal->j_state_lock);  	if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||  	    (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {  		DEFINE_WAIT(wait); @@ -777,13 +783,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)  int jbd2_fc_end_commit(journal_t *journal)  { -	return __jbd2_fc_end_commit(journal, 0, 0); +	return __jbd2_fc_end_commit(journal, 0, false);  }  EXPORT_SYMBOL(jbd2_fc_end_commit); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid) +int jbd2_fc_end_commit_fallback(journal_t *journal)  { -	return __jbd2_fc_end_commit(journal, tid, 1); +	tid_t tid; + +	read_lock(&journal->j_state_lock); +	tid = journal->j_running_transaction ? +		journal->j_running_transaction->t_tid : 0; +	read_unlock(&journal->j_state_lock); +	return __jbd2_fc_end_commit(journal, tid, true);  }  EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); @@ -865,7 +877,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)  	int fc_off;  	*bh_out = NULL; -	write_lock(&journal->j_state_lock);  	if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) {  		fc_off = journal->j_fc_off; @@ -874,7 +885,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)  	} else {  		ret = -EINVAL;  	} -	write_unlock(&journal->j_state_lock);  	if (ret)  		return ret; @@ -887,11 +897,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)  	if (!bh)  		return -ENOMEM; -	lock_buffer(bh); -	clear_buffer_uptodate(bh); -	set_buffer_dirty(bh); -	unlock_buffer(bh);  	journal->j_fc_wbuf[fc_off] = bh;  	*bh_out = bh; @@ -909,9 +915,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)  	struct buffer_head *bh;  	int i, j_fc_off; -	read_lock(&journal->j_state_lock);  	j_fc_off = journal->j_fc_off; -	read_unlock(&journal->j_state_lock);  	/*  	 * Wait in reverse order to minimize chances of us being woken up before @@ -939,9 +943,7 @@ int jbd2_fc_release_bufs(journal_t *journal)  	struct buffer_head *bh;  	int i, j_fc_off; -	read_lock(&journal->j_state_lock);  	j_fc_off = journal->j_fc_off; -	read_unlock(&journal->j_state_lock);  	/*  	 * Wait in reverse order to minimize chances of us being woken up before @@ -1348,23 +1350,16 @@ static journal_t *journal_init_common(struct block_device *bdev,  	journal->j_dev = bdev;  	journal->j_fs_dev = fs_dev;  	journal->j_blk_offset = start; -	journal->j_maxlen = len; +	journal->j_total_len = len;  	/* We need enough buffers to write out full descriptor block. 
*/  	n = journal->j_blocksize / jbd2_min_tag_size();  	journal->j_wbufsize = n; +	journal->j_fc_wbuf = NULL;  	journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),  					GFP_KERNEL);  	if (!journal->j_wbuf)  		goto err_cleanup; -	if (journal->j_fc_wbufsize > 0) { -		journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, -					sizeof(struct buffer_head *), -					GFP_KERNEL); -		if (!journal->j_fc_wbuf) -			goto err_cleanup; -	} -  	bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);  	if (!bh) {  		pr_err("%s: Cannot get buffer for journal superblock\n", @@ -1378,23 +1373,11 @@ static journal_t *journal_init_common(struct block_device *bdev,  err_cleanup:  	kfree(journal->j_wbuf); -	kfree(journal->j_fc_wbuf);  	jbd2_journal_destroy_revoke(journal);  	kfree(journal);  	return NULL;  } -int jbd2_fc_init(journal_t *journal, int num_fc_blks) -{ -	journal->j_fc_wbufsize = num_fc_blks; -	journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, -				sizeof(struct buffer_head *), GFP_KERNEL); -	if (!journal->j_fc_wbuf) -		return -ENOMEM; -	return 0; -} -EXPORT_SYMBOL(jbd2_fc_init); -  /* jbd2_journal_init_dev and jbd2_journal_init_inode:   *   * Create a journal structure assigned some fixed set of disk blocks to @@ -1512,16 +1495,7 @@ static int journal_reset(journal_t *journal)  	}  	journal->j_first = first; - -	if (jbd2_has_feature_fast_commit(journal) && -	    journal->j_fc_wbufsize > 0) { -		journal->j_fc_last = last; -		journal->j_last = last - journal->j_fc_wbufsize; -		journal->j_fc_first = journal->j_last + 1; -		journal->j_fc_off = 0; -	} else { -		journal->j_last = last; -	} +	journal->j_last = last;  	journal->j_head = journal->j_first;  	journal->j_tail = journal->j_first; @@ -1531,7 +1505,14 @@ static int journal_reset(journal_t *journal)  	journal->j_commit_sequence = journal->j_transaction_sequence - 1;  	journal->j_commit_request = journal->j_commit_sequence; -	journal->j_max_transaction_buffers = journal->j_maxlen / 4; +	journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); + +	/* +	 * Now that journal recovery is done, turn fast commits off here. This +	 * way, if fast commit was enabled before the crash but the FS has +	 * since disabled it, we don't re-enable fast commits. 
+	 */ +	jbd2_clear_feature_fast_commit(journal);  	/*  	 * As a special case, if the on-disk copy is already marked as needing @@ -1792,15 +1773,15 @@ static int journal_get_superblock(journal_t *journal)  		goto out;  	} -	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) -		journal->j_maxlen = be32_to_cpu(sb->s_maxlen); -	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { +	if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) +		journal->j_total_len = be32_to_cpu(sb->s_maxlen); +	else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {  		printk(KERN_WARNING "JBD2: journal file too short\n");  		goto out;  	}  	if (be32_to_cpu(sb->s_first) == 0 || -	    be32_to_cpu(sb->s_first) >= journal->j_maxlen) { +	    be32_to_cpu(sb->s_first) >= journal->j_total_len) {  		printk(KERN_WARNING  			"JBD2: Invalid start block of journal: %u\n",  			be32_to_cpu(sb->s_first)); @@ -1872,6 +1853,7 @@ static int load_superblock(journal_t *journal)  {  	int err;  	journal_superblock_t *sb; +	int num_fc_blocks;  	err = journal_get_superblock(journal);  	if (err) @@ -1883,15 +1865,17 @@ static int load_superblock(journal_t *journal)  	journal->j_tail = be32_to_cpu(sb->s_start);  	journal->j_first = be32_to_cpu(sb->s_first);  	journal->j_errno = be32_to_cpu(sb->s_errno); +	journal->j_last = be32_to_cpu(sb->s_maxlen); -	if (jbd2_has_feature_fast_commit(journal) && -	    journal->j_fc_wbufsize > 0) { +	if (jbd2_has_feature_fast_commit(journal)) {  		journal->j_fc_last = be32_to_cpu(sb->s_maxlen); -		journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize; +		num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); +		if (!num_fc_blocks) +			num_fc_blocks = JBD2_MIN_FC_BLOCKS; +		if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) +			journal->j_last = journal->j_fc_last - num_fc_blocks;  		journal->j_fc_first = journal->j_last + 1;  		journal->j_fc_off = 0; -	} else { -		journal->j_last = be32_to_cpu(sb->s_maxlen);  	}  	return 0; @@ -1899,7 +1883,7 @@ static int load_superblock(journal_t *journal)  /** - * int jbd2_journal_load() - Read journal from disk. + * jbd2_journal_load() - Read journal from disk.   * @journal: Journal to act on.   *   * Given a journal_t structure which tells us which disk blocks contain @@ -1954,9 +1938,6 @@ int jbd2_journal_load(journal_t *journal)  	 */  	journal->j_flags &= ~JBD2_ABORT; -	if (journal->j_fc_wbufsize > 0) -		jbd2_journal_set_features(journal, 0, 0, -					  JBD2_FEATURE_INCOMPAT_FAST_COMMIT);  	/* OK, we've finished with the dynamic journal bits:  	 * reinitialise the dynamic contents of the superblock in memory  	 * and reset them on disk. */ @@ -1972,7 +1953,7 @@ recovery_error:  }  /** - * void jbd2_journal_destroy() - Release a journal_t structure. + * jbd2_journal_destroy() - Release a journal_t structure.   * @journal: Journal to act on.   *   * Release a journal_t structure once it is no longer in use by the @@ -2040,8 +2021,7 @@ int jbd2_journal_destroy(journal_t *journal)  		jbd2_journal_destroy_revoke(journal);  	if (journal->j_chksum_driver)  		crypto_free_shash(journal->j_chksum_driver); -	if (journal->j_fc_wbufsize > 0) -		kfree(journal->j_fc_wbuf); +	kfree(journal->j_fc_wbuf);  	kfree(journal->j_wbuf);  	kfree(journal); @@ -2050,7 +2030,7 @@ int jbd2_journal_destroy(journal_t *journal)  /** - *int jbd2_journal_check_used_features() - Check if features specified are used. + * jbd2_journal_check_used_features() - Check if features specified are used.   * @journal: Journal to check.   
* @compat: bitmask of compatible features   * @ro: bitmask of features that force read-only mount @@ -2085,7 +2065,7 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,  }  /** - * int jbd2_journal_check_available_features() - Check feature set in journalling layer + * jbd2_journal_check_available_features() - Check feature set in journalling layer   * @journal: Journal to check.   * @compat: bitmask of compatible features   * @ro: bitmask of features that force read-only mount @@ -2116,8 +2096,39 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp  	return 0;  } +static int +jbd2_journal_initialize_fast_commit(journal_t *journal) +{ +	journal_superblock_t *sb = journal->j_superblock; +	unsigned long long num_fc_blks; + +	num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); +	if (num_fc_blks == 0) +		num_fc_blks = JBD2_MIN_FC_BLOCKS; +	if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) +		return -ENOSPC; + +	/* Are we called twice? */ +	WARN_ON(journal->j_fc_wbuf != NULL); +	journal->j_fc_wbuf = kmalloc_array(num_fc_blks, +				sizeof(struct buffer_head *), GFP_KERNEL); +	if (!journal->j_fc_wbuf) +		return -ENOMEM; + +	journal->j_fc_wbufsize = num_fc_blks; +	journal->j_fc_last = journal->j_last; +	journal->j_last = journal->j_fc_last - num_fc_blks; +	journal->j_fc_first = journal->j_last + 1; +	journal->j_fc_off = 0; +	journal->j_free = journal->j_last - journal->j_first; +	journal->j_max_transaction_buffers = +		jbd2_journal_get_max_txn_bufs(journal); + +	return 0; +} +  /** - * int jbd2_journal_set_features() - Mark a given journal feature in the superblock + * jbd2_journal_set_features() - Mark a given journal feature in the superblock   * @journal: Journal to act on.   * @compat: bitmask of compatible features   * @ro: bitmask of features that force read-only mount @@ -2159,6 +2170,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,  	sb = journal->j_superblock; +	if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { +		if (jbd2_journal_initialize_fast_commit(journal)) { +			pr_err("JBD2: Cannot enable fast commits.\n"); +			return 0; +		} +	} +  	/* Load the checksum driver if necessary */  	if ((journal->j_chksum_driver == NULL) &&  	    INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { @@ -2201,7 +2219,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,  }  /* - * jbd2_journal_clear_features () - Clear a given journal feature in the + * jbd2_journal_clear_features() - Clear a given journal feature in the   * 				    superblock   * @journal: Journal to act on.   * @compat: bitmask of compatible features @@ -2230,7 +2248,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,  EXPORT_SYMBOL(jbd2_journal_clear_features);  /** - * int jbd2_journal_flush () - Flush journal + * jbd2_journal_flush() - Flush journal   * @journal: Journal to act on.   *   * Flush all data for a given journal to disk and empty the journal. @@ -2305,7 +2323,7 @@ out:  }  /** - * int jbd2_journal_wipe() - Wipe journal contents + * jbd2_journal_wipe() - Wipe journal contents   * @journal: Journal to act on.   * @write: flag (see below)   * @@ -2346,7 +2364,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)  }  /** - * void jbd2_journal_abort () - Shutdown the journal immediately. + * jbd2_journal_abort () - Shutdown the journal immediately.   * @journal: the journal to shutdown.   
* @errno:   an error number to record in the journal indicating   *           the reason for the shutdown. @@ -2437,7 +2455,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)  }  /** - * int jbd2_journal_errno () - returns the journal's error state. + * jbd2_journal_errno() - returns the journal's error state.   * @journal: journal to examine.   *   * This is the errno number set with jbd2_journal_abort(), the last @@ -2461,7 +2479,7 @@ int jbd2_journal_errno(journal_t *journal)  }  /** - * int jbd2_journal_clear_err () - clears the journal's error state + * jbd2_journal_clear_err() - clears the journal's error state   * @journal: journal to act on.   *   * An error must be cleared or acked to take a FS out of readonly @@ -2481,7 +2499,7 @@ int jbd2_journal_clear_err(journal_t *journal)  }  /** - * void jbd2_journal_ack_err() - Ack journal err. + * jbd2_journal_ack_err() - Ack journal err.   * @journal: journal to act on.   *   * An error must be cleared or acked to take a FS out of readonly diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index eb2606133cd8..dc0694fcfcd1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start)  	/* Do up to 128K of readahead */  	max = start + (128 * 1024 / journal->j_blocksize); -	if (max > journal->j_maxlen) -		max = journal->j_maxlen; +	if (max > journal->j_total_len) +		max = journal->j_total_len;  	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at  	 * a time to the block device IO layer. */ @@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,  	*bhp = NULL; -	if (offset >= journal->j_maxlen) { +	if (offset >= journal->j_total_len) {  		printk(KERN_ERR "JBD2: corrupted journal superblock\n");  		return -EFSCORRUPTED;  	} diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 43985738aa86..9396666b7314 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -195,8 +195,10 @@ static void wait_transaction_switching(journal_t *journal)  	DEFINE_WAIT(wait);  	if (WARN_ON(!journal->j_running_transaction || -		    journal->j_running_transaction->t_state != T_SWITCH)) +		    journal->j_running_transaction->t_state != T_SWITCH)) { +		read_unlock(&journal->j_state_lock);  		return; +	}  	prepare_to_wait(&journal->j_wait_transaction_locked, &wait,  			TASK_UNINTERRUPTIBLE);  	read_unlock(&journal->j_state_lock); @@ -517,7 +519,7 @@ EXPORT_SYMBOL(jbd2__journal_start);  /** - * handle_t *jbd2_journal_start() - Obtain a new handle. + * jbd2_journal_start() - Obtain a new handle.   * @journal: Journal to start transaction on.   * @nblocks: number of block buffer we might modify   * @@ -564,7 +566,7 @@ void jbd2_journal_free_reserved(handle_t *handle)  EXPORT_SYMBOL(jbd2_journal_free_reserved);  /** - * int jbd2_journal_start_reserved() - start reserved handle + * jbd2_journal_start_reserved() - start reserved handle   * @handle: handle to start   * @type: for handle statistics   * @line_no: for handle statistics @@ -618,7 +620,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,  EXPORT_SYMBOL(jbd2_journal_start_reserved);  /** - * int jbd2_journal_extend() - extend buffer credits. + * jbd2_journal_extend() - extend buffer credits.   * @handle:  handle to 'extend'   * @nblocks: nr blocks to try to extend by.   * @revoke_records: number of revoke records to try to extend by. 
@@ -743,7 +745,7 @@ static void stop_this_handle(handle_t *handle)  }  /** - * int jbd2_journal_restart() - restart a handle . + * jbd2__journal_restart() - restart a handle .   * @handle:  handle to restart   * @nblocks: nr credits requested   * @revoke_records: number of revoke record credits requested @@ -813,7 +815,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)  EXPORT_SYMBOL(jbd2_journal_restart);  /** - * void jbd2_journal_lock_updates () - establish a transaction barrier. + * jbd2_journal_lock_updates () - establish a transaction barrier.   * @journal:  Journal to establish a barrier on.   *   * This locks out any further updates from being started, and blocks @@ -872,7 +874,7 @@ void jbd2_journal_lock_updates(journal_t *journal)  }  /** - * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier + * jbd2_journal_unlock_updates () - release barrier   * @journal:  Journal to release the barrier on.   *   * Release a transaction barrier obtained with jbd2_journal_lock_updates(). @@ -1180,7 +1182,8 @@ out:  }  /** - * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * jbd2_journal_get_write_access() - notify intent to modify a buffer + *				     for metadata (not data) update.   * @handle: transaction to add buffer modifications to   * @bh:     bh to be used for metadata writes   * @@ -1224,7 +1227,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)   * unlocked buffer beforehand. */  /** - * int jbd2_journal_get_create_access () - notify intent to use newly created bh + * jbd2_journal_get_create_access () - notify intent to use newly created bh   * @handle: transaction to new buffer to   * @bh: new buffer.   * @@ -1304,7 +1307,7 @@ out:  }  /** - * int jbd2_journal_get_undo_access() -  Notify intent to modify metadata with + * jbd2_journal_get_undo_access() -  Notify intent to modify metadata with   *     non-rewindable consequences   * @handle: transaction   * @bh: buffer to undo @@ -1381,7 +1384,7 @@ out:  }  /** - * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * jbd2_journal_set_triggers() - Add triggers for commit writeout   * @bh: buffer to trigger on   * @type: struct jbd2_buffer_trigger_type containing the trigger(s).   * @@ -1423,7 +1426,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,  }  /** - * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata + * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata   * @handle: transaction to add buffer to.   * @bh: buffer to mark   * @@ -1591,7 +1594,7 @@ out:  }  /** - * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * jbd2_journal_forget() - bforget() for potentially-journaled buffers.   * @handle: transaction handle   * @bh:     bh to 'forget'   * @@ -1760,7 +1763,7 @@ drop:  }  /** - * int jbd2_journal_stop() - complete a transaction + * jbd2_journal_stop() - complete a transaction   * @handle: transaction to complete.   *   * All done for a particular handle. @@ -2078,7 +2081,7 @@ out:  }  /** - * int jbd2_journal_try_to_free_buffers() - try to free page buffers. + * jbd2_journal_try_to_free_buffers() - try to free page buffers.   * @journal: journal for operation   * @page: to try and free   * @@ -2409,7 +2412,7 @@ zap_buffer_unlocked:  }  /** - * void jbd2_journal_invalidatepage() + * jbd2_journal_invalidatepage()   * @journal: journal to use for flush...   
* @page:    page to flush   * @offset:  start of the range to invalidate diff --git a/fs/libfs.c b/fs/libfs.c index fc34361c1489..7124c2e8df2f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -959,7 +959,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,  			  size_t len, loff_t *ppos)  {  	struct simple_attr *attr; -	u64 val; +	unsigned long long val;  	size_t size;  	ssize_t ret; @@ -977,7 +977,9 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,  		goto out;  	attr->set_buf[size] = '\0'; -	val = simple_strtoll(attr->set_buf, NULL, 0); +	ret = kstrtoull(attr->set_buf, 0, &val); +	if (ret) +		goto out;  	ret = attr->set(attr->data, val);  	if (ret == 0)  		ret = len; /* on success, claim we got the whole input */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index cb52db9a0cfb..4e011adaf967 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -955,7 +955,6 @@ out:  static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)  { -	struct inode *inode = file_inode(filp);  	struct nfs_open_dir_context *dir_ctx = filp->private_data;  	dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", @@ -967,15 +966,15 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)  	case SEEK_SET:  		if (offset < 0)  			return -EINVAL; -		inode_lock(inode); +		spin_lock(&filp->f_lock);  		break;  	case SEEK_CUR:  		if (offset == 0)  			return filp->f_pos; -		inode_lock(inode); +		spin_lock(&filp->f_lock);  		offset += filp->f_pos;  		if (offset < 0) { -			inode_unlock(inode); +			spin_unlock(&filp->f_lock);  			return -EINVAL;  		}  	} @@ -987,7 +986,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)  			dir_ctx->dir_cookie = 0;  		dir_ctx->duped = 0;  	} -	inode_unlock(inode); +	spin_unlock(&filp->f_lock);  	return offset;  } @@ -998,13 +997,9 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)  static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,  			 int datasync)  { -	struct inode *inode = file_inode(filp); -  	dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); -	inode_lock(inode); -	nfs_inc_stats(inode, NFSIOS_VFSFSYNC); -	inode_unlock(inode); +	nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC);  	return 0;  } diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index b51424ff8159..6c2ce799150f 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -1047,8 +1047,10 @@ out4:  void nfs4_xattr_cache_exit(void)  { +	unregister_shrinker(&nfs4_xattr_large_entry_shrinker);  	unregister_shrinker(&nfs4_xattr_entry_shrinker);  	unregister_shrinker(&nfs4_xattr_cache_shrinker); +	list_lru_destroy(&nfs4_xattr_large_entry_lru);  	list_lru_destroy(&nfs4_xattr_entry_lru);  	list_lru_destroy(&nfs4_xattr_cache_lru);  	kmem_cache_destroy(nfs4_xattr_cache_cachep); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0dc31ad2362e..6e060a88f98c 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -196,7 +196,7 @@  				 1 + nfs4_xattr_name_maxsz + 1)  #define decode_setxattr_maxsz   (op_decode_hdr_maxsz + decode_change_info_maxsz)  #define encode_listxattrs_maxsz  (op_encode_hdr_maxsz + 2 + 1) -#define decode_listxattrs_maxsz  (op_decode_hdr_maxsz + 2 + 1 + 1) +#define decode_listxattrs_maxsz  (op_decode_hdr_maxsz + 2 + 1 + 1 + 1)  #define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \  				  nfs4_xattr_name_maxsz)  #define decode_removexattr_maxsz (op_decode_hdr_maxsz + \ @@ -531,7 +531,7 @@ static void encode_listxattrs(struct xdr_stream *xdr,  {  	
__be32 *p; -	encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz + 1, hdr); +	encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz, hdr);  	p = reserve_space(xdr, 12);  	if (unlikely(!p)) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 8d3278805602..fa148308822c 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -88,7 +88,13 @@  #define NFS_ROOT		"/tftpboot/%s"  /* Default NFSROOT mount options. */ +#if defined(CONFIG_NFS_V2)  #define NFS_DEF_OPTIONS		"vers=2,tcp,rsize=4096,wsize=4096" +#elif defined(CONFIG_NFS_V3) +#define NFS_DEF_OPTIONS		"vers=3,tcp,rsize=4096,wsize=4096" +#else +#define NFS_DEF_OPTIONS		"vers=4,tcp,rsize=4096,wsize=4096" +#endif  /* Parameters passed from the kernel command line */  static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = ""; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 14468613d150..a633044b0dc1 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -316,10 +316,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)  	fh_copy(&resp->dirfh, &argp->fh);  	fh_init(&resp->fh, NFS3_FHSIZE); -	if (argp->ftype == 0 || argp->ftype >= NF3BAD) { -		resp->status = nfserr_inval; -		goto out; -	}  	if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) {  		rdev = MKDEV(argp->major, argp->minor);  		if (MAJOR(rdev) != argp->major || @@ -328,7 +324,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)  			goto out;  		}  	} else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) { -		resp->status = nfserr_inval; +		resp->status = nfserr_badtype;  		goto out;  	} diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9c23b6acf234..2277f83da250 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1114,6 +1114,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)  {  	struct nfsd3_pathconfres *resp = rqstp->rq_resp; +	*p++ = resp->status;  	*p++ = xdr_zero;	/* no post_op_attr */  	if (resp->status == 0) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ad2fa1a8e7ad..e83b21778816 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1299,7 +1299,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,  			struct nfsd_file *dst)  {  	nfs42_ssc_close(src->nf_file); -	nfsd_file_put(src); +	/* 'src' is freed by nfsd4_do_async_copy */  	nfsd_file_put(dst);  	mntput(ss_mnt);  } @@ -1486,6 +1486,7 @@ do_callback:  	cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);  	if (!cb_copy)  		goto out; +	refcount_set(&cb_copy->refcount, 1);  	memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res));  	cb_copy->cp_clp = copy->cp_clp;  	cb_copy->nfserr = copy->nfserr; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index a960ec3a569a..8d3ad5ef2925 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -178,6 +178,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,  	struct inode *inode = d_inode(dentry);  	struct dentry *parent;  	bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; +	bool parent_needed, parent_interested;  	__u32 p_mask;  	struct inode *p_inode = NULL;  	struct name_snapshot name; @@ -193,7 +194,8 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,  		return 0;  	parent = NULL; -	if (!parent_watched && !fsnotify_event_needs_parent(inode, mnt, mask)) +	parent_needed = fsnotify_event_needs_parent(inode, mnt, mask); +	if (!parent_watched && !parent_needed)  		goto notify;  	/* Does parent inode care about events on children? 
*/ @@ -205,17 +207,17 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,  	/*  	 * Include parent/name in notification either if some notification -	 * groups require parent info (!parent_watched case) or the parent is -	 * interested in this event. +	 * groups require parent info or the parent is interested in this event.  	 */ -	if (!parent_watched || (mask & p_mask & ALL_FSNOTIFY_EVENTS)) { +	parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; +	if (parent_needed || parent_interested) {  		/* When notifying parent, child should be passed as data */  		WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));  		/* Notify both parent and child with child name info */  		take_dentry_name_snapshot(&name, dentry);  		file_name = &name.name; -		if (parent_watched) +		if (parent_interested)  			mask |= FS_EVENT_ON_CHILD;  	} diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b9a9d69dde7e..db52e843002a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)  		goto done;  	} -	trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); +	trace_ocfs2_journal_init_maxlen(j_journal->j_total_len);  	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &  		  OCFS2_JOURNAL_DIRTY_FL); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1d91dd1e8711..2febc76e9de7 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1713,6 +1713,7 @@ static void ocfs2_inode_init_once(void *data)  	oi->ip_blkno = 0ULL;  	oi->ip_clusters = 0; +	oi->ip_next_orphan = NULL;  	ocfs2_resv_init_once(&oi->ip_la_data_resv); diff --git a/fs/proc/base.c b/fs/proc/base.c index 0f707003dda5..b362523a9829 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1049,6 +1049,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,  		oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /  			  OOM_SCORE_ADJ_MAX;  	put_task_struct(task); +	if (oom_adj > OOM_ADJUST_MAX) +		oom_adj = OOM_ADJUST_MAX;  	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);  	return simple_read_from_buffer(buf, count, ppos, buffer, len);  } diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index d0989a443c77..419760fd77bd 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -19,7 +19,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)  static const struct proc_ops cpuinfo_proc_ops = {  	.proc_flags	= PROC_ENTRY_PERMANENT,  	.proc_open	= cpuinfo_open, -	.proc_read	= seq_read, +	.proc_read_iter	= seq_read_iter,  	.proc_lseek	= seq_lseek,  	.proc_release	= seq_release,  }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2f9fa179194d..b84663252add 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -590,7 +590,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)  static const struct proc_ops proc_seq_ops = {  	/* not permanent -- can call into arbitrary seq_operations */  	.proc_open	= proc_seq_open, -	.proc_read	= seq_read, +	.proc_read_iter	= seq_read_iter,  	.proc_lseek	= seq_lseek,  	.proc_release	= proc_seq_release,  }; @@ -621,7 +621,7 @@ static int proc_single_open(struct inode *inode, struct file *file)  static const struct proc_ops proc_single_ops = {  	/* not permanent -- can call into arbitrary ->single_show */  	.proc_open	= proc_single_open, -	.proc_read	= seq_read, +	.proc_read_iter = seq_read_iter,  	.proc_lseek	= seq_lseek,  	.proc_release	= single_release,  }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 
58c075e2a452..bde6b6f69852 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -597,6 +597,7 @@ static const struct file_operations proc_iter_file_ops = {  	.llseek		= proc_reg_llseek,  	.read_iter	= proc_reg_read_iter,  	.write		= proc_reg_write, +	.splice_read	= generic_file_splice_read,  	.poll		= proc_reg_poll,  	.unlocked_ioctl	= proc_reg_unlocked_ioctl,  	.mmap		= proc_reg_mmap, @@ -622,6 +623,7 @@ static const struct file_operations proc_reg_file_ops_compat = {  static const struct file_operations proc_iter_file_ops_compat = {  	.llseek		= proc_reg_llseek,  	.read_iter	= proc_reg_read_iter, +	.splice_read	= generic_file_splice_read,  	.write		= proc_reg_write,  	.poll		= proc_reg_poll,  	.unlocked_ioctl	= proc_reg_unlocked_ioctl, diff --git a/fs/proc/self.c b/fs/proc/self.c index 72cd69bcaf4a..cc71ce3466dc 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,6 +16,13 @@ static const char *proc_self_get_link(struct dentry *dentry,  	pid_t tgid = task_tgid_nr_ns(current, ns);  	char *name; +	/* +	 * Not currently supported. Once we can inherit all of struct pid, +	 * we can allow this. +	 */ +	if (current->flags & PF_KTHREAD) +		return ERR_PTR(-EOPNOTSUPP); +  	if (!tgid)  		return ERR_PTR(-ENOENT);  	/* max length of unsigned int in decimal + NULL term */ diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 46b3293015fe..4695b6de3151 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -226,7 +226,7 @@ static int stat_open(struct inode *inode, struct file *file)  static const struct proc_ops stat_proc_ops = {  	.proc_flags	= PROC_ENTRY_PERMANENT,  	.proc_open	= stat_open, -	.proc_read	= seq_read, +	.proc_read_iter	= seq_read_iter,  	.proc_lseek	= seq_lseek,  	.proc_release	= single_release,  }; diff --git a/fs/select.c b/fs/select.c index 7aef49552d4c..ebfebdfe5c69 100644 --- a/fs/select.c +++ b/fs/select.c @@ -97,7 +97,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv)  struct poll_table_page {  	struct poll_table_page * next;  	struct poll_table_entry * entry; -	struct poll_table_entry entries[0]; +	struct poll_table_entry entries[];  };  #define POLL_TABLE_FULL(table) \ @@ -836,7 +836,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)  struct poll_list {  	struct poll_list *next;  	int len; -	struct pollfd entries[0]; +	struct pollfd entries[];  };  #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) diff --git a/fs/seq_file.c b/fs/seq_file.c index 31219c1db17d..3b20e21604e7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -18,6 +18,7 @@  #include <linux/mm.h>  #include <linux/printk.h>  #include <linux/string_helpers.h> +#include <linux/uio.h>  #include <linux/uaccess.h>  #include <asm/page.h> @@ -146,7 +147,28 @@ Eoverflow:   */  ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  { -	struct seq_file *m = file->private_data; +	struct iovec iov = { .iov_base = buf, .iov_len = size}; +	struct kiocb kiocb; +	struct iov_iter iter; +	ssize_t ret; + +	init_sync_kiocb(&kiocb, file); +	iov_iter_init(&iter, READ, &iov, 1, size); + +	kiocb.ki_pos = *ppos; +	ret = seq_read_iter(&kiocb, &iter); +	*ppos = kiocb.ki_pos; +	return ret; +} +EXPORT_SYMBOL(seq_read); + +/* + * Ready-made ->f_op->read_iter() + */ +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ +	struct seq_file *m = iocb->ki_filp->private_data; +	size_t size = iov_iter_count(iter);  	size_t copied = 0;  	size_t n;  	void *p; @@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t 
*ppos)  	 * if request is to read from zero offset, reset iterator to first  	 * record as it might have been already advanced by previous requests  	 */ -	if (*ppos == 0) { +	if (iocb->ki_pos == 0) {  		m->index = 0;  		m->count = 0;  	} -	/* Don't assume *ppos is where we left it */ -	if (unlikely(*ppos != m->read_pos)) { -		while ((err = traverse(m, *ppos)) == -EAGAIN) +	/* Don't assume ki_pos is where we left it */ +	if (unlikely(iocb->ki_pos != m->read_pos)) { +		while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)  			;  		if (err) {  			/* With prejudice... */ @@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  			m->count = 0;  			goto Done;  		} else { -			m->read_pos = *ppos; +			m->read_pos = iocb->ki_pos;  		}  	} @@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  	/* if not empty - flush it first */  	if (m->count) {  		n = min(m->count, size); -		err = copy_to_user(buf, m->buf + m->from, n); -		if (err) +		if (copy_to_iter(m->buf + m->from, n, iter) != n)  			goto Efault;  		m->count -= n;  		m->from += n;  		size -= n; -		buf += n;  		copied += n;  		if (!size)  			goto Done; @@ -254,8 +274,7 @@ Fill:  	}  	m->op->stop(m, p);  	n = min(m->count, size); -	err = copy_to_user(buf, m->buf, n); -	if (err) +	if (copy_to_iter(m->buf, n, iter) != n)  		goto Efault;  	copied += n;  	m->count -= n; @@ -264,7 +283,7 @@ Done:  	if (!copied)  		copied = err;  	else { -		*ppos += copied; +		iocb->ki_pos += copied;  		m->read_pos += copied;  	}  	mutex_unlock(&m->lock); @@ -276,7 +295,7 @@ Efault:  	err = -EFAULT;  	goto Done;  } -EXPORT_SYMBOL(seq_read); +EXPORT_SYMBOL(seq_read_iter);  /**   *	seq_lseek -	->llseek() method for sequential files. diff --git a/fs/super.c b/fs/super.c index a51c2083cd6b..98bb0629ee10 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1631,55 +1631,6 @@ int super_setup_bdi(struct super_block *sb)  }  EXPORT_SYMBOL(super_setup_bdi); -/* - * This is an internal function, please use sb_end_{write,pagefault,intwrite} - * instead. - */ -void __sb_end_write(struct super_block *sb, int level) -{ -	percpu_up_read(sb->s_writers.rw_sem + level-1); -} -EXPORT_SYMBOL(__sb_end_write); - -/* - * This is an internal function, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -int __sb_start_write(struct super_block *sb, int level, bool wait) -{ -	bool force_trylock = false; -	int ret = 1; - -#ifdef CONFIG_LOCKDEP -	/* -	 * We want lockdep to tell us about possible deadlocks with freezing -	 * but it's it bit tricky to properly instrument it. Getting a freeze -	 * protection works as getting a read lock but there are subtle -	 * problems. XFS for example gets freeze protection on internal level -	 * twice in some cases, which is OK only because we already hold a -	 * freeze protection also on higher level. Due to these cases we have -	 * to use wait == F (trylock mode) which must not fail. 
-	 */ -	if (wait) { -		int i; - -		for (i = 0; i < level - 1; i++) -			if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { -				force_trylock = true; -				break; -			} -	} -#endif -	if (wait && !force_trylock) -		percpu_down_read(sb->s_writers.rw_sem + level-1); -	else -		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - -	WARN_ON(force_trylock && !ret); -	return ret; -} -EXPORT_SYMBOL(__sb_start_write); -  /**   * sb_wait_write - wait until all writers to given file system finish   * @sb: the super for which we wait diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 852b536551b5..15640015be9d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2467,6 +2467,7 @@ xfs_defer_agfl_block(  	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);  	new->xefi_blockcount = 1;  	new->xefi_oinfo = *oinfo; +	new->xefi_skip_discard = false;  	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index bb128db220ac..d6ef69ab1c67 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -515,7 +515,7 @@ xfs_attr_copy_value(   *========================================================================*/  /* - * Query whether the requested number of additional bytes of extended + * Query whether the total requested number of attr fork bytes of extended   * attribute space will be able to fit inline.   *   * Returns zero if not, else the di_forkoff fork offset to be used in the @@ -535,6 +535,12 @@ xfs_attr_shortform_bytesfit(  	int			maxforkoff;  	int			offset; +	/* +	 * Check if the new size could fit at all first: +	 */ +	if (bytes > XFS_LITINO(mp)) +		return 0; +  	/* rounded down */  	offset = (XFS_LITINO(mp) - bytes) >> 3; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e1bd484e5548..6747e97a7949 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -52,9 +52,9 @@ struct xfs_extent_free_item  {  	xfs_fsblock_t		xefi_startblock;/* starting fs block number */  	xfs_extlen_t		xefi_blockcount;/* number of blocks in extent */ +	bool			xefi_skip_discard;  	struct list_head	xefi_list;  	struct xfs_owner_info	xefi_oinfo;	/* extent owner */ -	bool			xefi_skip_discard;  };  #define	XFS_BMAP_MAX_NMAP	4 diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 340c83f76c80..2668ebe02865 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -1514,7 +1514,7 @@ xfs_rmap_convert_shared(  	 * record for our insertion point. This will also give us the record for  	 * start block contiguity tests.  	 */ -	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, +	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, oldext,  			&PREV, &i);  	if (error)  		goto done; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 955302e7cdde..fed56d213a3f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -113,6 +113,8 @@ xchk_bmap_get_rmap(  	if (info->whichfork == XFS_ATTR_FORK)  		rflags |= XFS_RMAP_ATTR_FORK; +	if (irec->br_state == XFS_EXT_UNWRITTEN) +		rflags |= XFS_RMAP_UNWRITTEN;  	/*  	 * CoW staging extents are owned (on disk) by the refcountbt, so @@ -216,13 +218,13 @@ xchk_bmap_xref_rmap(  	 * which doesn't track unwritten state.  	 
*/  	if (owner != XFS_RMAP_OWN_COW && -	    irec->br_state == XFS_EXT_UNWRITTEN && -	    !(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) +	    !!(irec->br_state == XFS_EXT_UNWRITTEN) != +	    !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN))  		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,  				irec->br_startoff); -	if (info->whichfork == XFS_ATTR_FORK && -	    !(rmap.rm_flags & XFS_RMAP_ATTR_FORK)) +	if (!!(info->whichfork == XFS_ATTR_FORK) != +	    !!(rmap.rm_flags & XFS_RMAP_ATTR_FORK))  		xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,  				irec->br_startoff);  	if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index f52a7b8256f9..debf392e0515 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -452,32 +452,41 @@ xchk_btree_check_minrecs(  	int			level,  	struct xfs_btree_block	*block)  { -	unsigned int		numrecs; -	int			ok_level; - -	numrecs = be16_to_cpu(block->bb_numrecs); +	struct xfs_btree_cur	*cur = bs->cur; +	unsigned int		root_level = cur->bc_nlevels - 1; +	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);  	/* More records than minrecs means the block is ok. */ -	if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level)) +	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))  		return;  	/* -	 * Certain btree blocks /can/ have fewer than minrecs records.  Any -	 * level greater than or equal to the level of the highest dedicated -	 * btree block are allowed to violate this constraint. -	 * -	 * For a btree rooted in a block, the btree root can have fewer than -	 * minrecs records.  If the btree is rooted in an inode and does not -	 * store records in the root, the direct children of the root and the -	 * root itself can have fewer than minrecs records. +	 * For btrees rooted in the inode, it's possible that the root block +	 * contents spilled into a regular ondisk block because there wasn't +	 * enough space in the inode root.  The number of records in that +	 * child block might be less than the standard minrecs, but that's ok +	 * provided that there's only one direct child of the root.  	 */ -	ok_level = bs->cur->bc_nlevels - 1; -	if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) -		ok_level--; -	if (level >= ok_level) +	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && +	    level == cur->bc_nlevels - 2) { +		struct xfs_btree_block	*root_block; +		struct xfs_buf		*root_bp; +		int			root_maxrecs; + +		root_block = xfs_btree_get_block(cur, root_level, &root_bp); +		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level); +		if (be16_to_cpu(root_block->bb_numrecs) != 1 || +		    numrecs <= root_maxrecs) +			xchk_btree_set_corrupt(bs->sc, cur, level);  		return; +	} -	xchk_btree_set_corrupt(bs->sc, bs->cur, level); +	/* +	 * Otherwise, only the root level is allowed to have fewer than minrecs +	 * records or keyptrs. +	 */ +	if (level < root_level) +		xchk_btree_set_corrupt(bs->sc, cur, level);  }  /* diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 7c432997edad..b045e95c2ea7 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -558,14 +558,27 @@ xchk_directory_leaf1_bestfree(  	/* Check all the bestfree entries. 
*/  	for (i = 0; i < bestcount; i++, bestp++) {  		best = be16_to_cpu(*bestp); -		if (best == NULLDATAOFF) -			continue;  		error = xfs_dir3_data_read(sc->tp, sc->ip, -				i * args->geo->fsbcount, 0, &dbp); +				xfs_dir2_db_to_da(args->geo, i), +				XFS_DABUF_MAP_HOLE_OK, +				&dbp);  		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,  				&error))  			break; -		xchk_directory_check_freesp(sc, lblk, dbp, best); + +		if (!dbp) { +			if (best != NULLDATAOFF) { +				xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, +						lblk); +				break; +			} +			continue; +		} + +		if (best == NULLDATAOFF) +			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +		else +			xchk_directory_check_freesp(sc, lblk, dbp, best);  		xfs_trans_brelse(sc->tp, dbp);  		if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)  			break; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 3aa85b64de36..bb25ff1b770d 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -121,8 +121,7 @@ xchk_inode_flags(  		goto bad;  	/* rt flags require rt device */ -	if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) && -	    !mp->m_rtdev_targp) +	if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)  		goto bad;  	/* new rt bitmap flag only valid for rbmino */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index beaeb6fa3119..dd672e6bbc75 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -170,7 +170,6 @@ xchk_refcountbt_process_rmap_fragments(  	 */  	INIT_LIST_HEAD(&worklist);  	rbno = NULLAGBLOCK; -	nr = 1;  	/* Make sure the fragments actually /are/ in agbno order. */  	bno = 0; @@ -184,15 +183,14 @@ xchk_refcountbt_process_rmap_fragments(  	 * Find all the rmaps that start at or before the refc extent,  	 * and put them on the worklist.  	 */ +	nr = 0;  	list_for_each_entry_safe(frag, n, &refchk->fragments, list) { -		if (frag->rm.rm_startblock > refchk->bno) -			goto done; +		if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) +			break;  		bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;  		if (bno < rbno)  			rbno = bno;  		list_move_tail(&frag->list, &worklist); -		if (nr == target_nr) -			break;  		nr++;  	} diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 55d126d4e096..4304c6416fbb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -346,8 +346,8 @@ xfs_map_blocks(  	ssize_t			count = i_blocksize(inode);  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);  	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count); -	xfs_fileoff_t		cow_fsb = NULLFILEOFF; -	int			whichfork = XFS_DATA_FORK; +	xfs_fileoff_t		cow_fsb; +	int			whichfork;  	struct xfs_bmbt_irec	imap;  	struct xfs_iext_cursor	icur;  	int			retries = 0; @@ -381,6 +381,8 @@ xfs_map_blocks(  	 * landed in a hole and we skip the block.  	 
*/  retry: +	cow_fsb = NULLFILEOFF; +	whichfork = XFS_DATA_FORK;  	xfs_ilock(ip, XFS_ILOCK_SHARED);  	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||  	       (ip->i_df.if_flags & XFS_IFEXTENTS)); @@ -527,13 +529,15 @@ xfs_prepare_ioend(   */  static void  xfs_discard_page( -	struct page		*page) +	struct page		*page, +	loff_t			fileoff)  {  	struct inode		*inode = page->mapping->host;  	struct xfs_inode	*ip = XFS_I(inode);  	struct xfs_mount	*mp = ip->i_mount; -	loff_t			offset = page_offset(page); -	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset); +	unsigned int		pageoff = offset_in_page(fileoff); +	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff); +	xfs_fileoff_t		pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);  	int			error;  	if (XFS_FORCED_SHUTDOWN(mp)) @@ -541,14 +545,14 @@ xfs_discard_page(  	xfs_alert_ratelimited(mp,  		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", -			page, ip->i_ino, offset); +			page, ip->i_ino, fileoff);  	error = xfs_bmap_punch_delalloc_range(ip, start_fsb, -			i_blocks_per_page(inode, page)); +			i_blocks_per_page(inode, page) - pageoff_fsb);  	if (error && !XFS_FORCED_SHUTDOWN(mp))  		xfs_alert(mp, "page discard unable to remove delalloc mapping.");  out_invalidate: -	iomap_invalidatepage(page, 0, PAGE_SIZE); +	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);  }  static const struct iomap_writeback_ops xfs_writeback_ops = { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3abb8b9d6f4c..7b9ff824e82d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -706,6 +706,23 @@ relock:  	return 0;  } +/* + * Check that the imap we are going to return to the caller spans the entire + * range that the caller requested for the IO. + */ +static bool +imap_spans_range( +	struct xfs_bmbt_irec	*imap, +	xfs_fileoff_t		offset_fsb, +	xfs_fileoff_t		end_fsb) +{ +	if (imap->br_startoff > offset_fsb) +		return false; +	if (imap->br_startoff + imap->br_blockcount < end_fsb) +		return false; +	return true; +} +  static int  xfs_direct_write_iomap_begin(  	struct inode		*inode, @@ -766,6 +783,18 @@ xfs_direct_write_iomap_begin(  	if (imap_needs_alloc(inode, flags, &imap, nimaps))  		goto allocate_blocks; +	/* +	 * NOWAIT IO needs to span the entire requested IO with a single map so +	 * that we avoid partial IO failures due to the rest of the IO range not +	 * covered by this map triggering an EAGAIN condition when it is +	 * subsequently mapped and aborting the IO. +	 */ +	if ((flags & IOMAP_NOWAIT) && +	    !imap_spans_range(&imap, offset_fsb, end_fsb)) { +		error = -EAGAIN; +		goto out_unlock; +	} +  	xfs_iunlock(ip, lockmode);  	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);  	return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 5e165456da68..1414ab79eacf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -911,6 +911,16 @@ xfs_setattr_size(  		error = iomap_zero_range(inode, oldsize, newsize - oldsize,  				&did_zeroing, &xfs_buffered_write_iomap_ops);  	} else { +		/* +		 * iomap won't detect a dirty page over an unwritten block (or a +		 * cow block over a hole) and subsequently skips zeroing the +		 * newly post-EOF portion of the page. Flush the new EOF to +		 * convert the block before the pagecache truncate. 
+		 */ +		error = filemap_write_and_wait_range(inode->i_mapping, newsize, +						     newsize); +		if (error) +			return error;  		error = iomap_truncate_page(inode, newsize, &did_zeroing,  				&xfs_buffered_write_iomap_ops);  	} diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 233dcc8784db..2a45138831e3 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -55,6 +55,9 @@ struct xfs_iwalk_ag {  	/* Where do we start the traversal? */  	xfs_ino_t			startino; +	/* What was the last inode number we saw when iterating the inobt? */ +	xfs_ino_t			lastino; +  	/* Array of inobt records we cache. */  	struct xfs_inobt_rec_incore	*recs; @@ -301,6 +304,9 @@ xfs_iwalk_ag_start(  	if (XFS_IS_CORRUPT(mp, *has_more != 1))  		return -EFSCORRUPTED; +	iwag->lastino = XFS_AGINO_TO_INO(mp, agno, +				irec->ir_startino + XFS_INODES_PER_CHUNK - 1); +  	/*  	 * If the LE lookup yielded an inobt record before the cursor position,  	 * skip it and see if there's another one after it. @@ -347,15 +353,17 @@ xfs_iwalk_run_callbacks(  	struct xfs_mount		*mp = iwag->mp;  	struct xfs_trans		*tp = iwag->tp;  	struct xfs_inobt_rec_incore	*irec; -	xfs_agino_t			restart; +	xfs_agino_t			next_agino;  	int				error; +	next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1; +  	ASSERT(iwag->nr_recs > 0);  	/* Delete cursor but remember the last record we cached... */  	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);  	irec = &iwag->recs[iwag->nr_recs - 1]; -	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; +	ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);  	error = xfs_iwalk_ag_recs(iwag);  	if (error) @@ -372,7 +380,7 @@ xfs_iwalk_run_callbacks(  	if (error)  		return error; -	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); +	return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);  }  /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ @@ -396,6 +404,7 @@ xfs_iwalk_ag(  	while (!error && has_more) {  		struct xfs_inobt_rec_incore	*irec; +		xfs_ino_t			rec_fsino;  		cond_resched();  		if (xfs_pwork_want_abort(&iwag->pwork)) @@ -407,6 +416,15 @@ xfs_iwalk_ag(  		if (error || !has_more)  			break; +		/* Make sure that we always move forward. */ +		rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino); +		if (iwag->lastino != NULLFSINO && +		    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { +			error = -EFSCORRUPTED; +			goto out; +		} +		iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1; +  		/* No allocated inodes in this chunk; skip it. 
*/  	if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {  		error = xfs_btree_increment(cur, 0, &has_more); @@ -535,6 +553,7 @@ xfs_iwalk(  		.trim_start	= 1,  		.skip_empty	= 1,  		.pwork		= XFS_PWORK_SINGLE_THREADED, +		.lastino	= NULLFSINO,  	};  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);  	int		error; @@ -623,6 +642,7 @@ xfs_iwalk_threaded(  		iwag->data = data;  		iwag->startino = startino;  		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);  +		iwag->lastino = NULLFSINO;  		xfs_pwork_queue(&pctl, &iwag->pwork);  		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);  		if (flags & XFS_INOBT_WALK_SAME_AG) @@ -696,6 +716,7 @@ xfs_inobt_walk(  		.startino	= startino,  		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),  		.pwork		= XFS_PWORK_SINGLE_THREADED, +		.lastino	= NULLFSINO,  	};  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);  	int			error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 150ee5cb8645..7110507a2b6b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -194,20 +194,25 @@ xfs_initialize_perag(  		}  		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); -		if (!pag) +		if (!pag) { +			error = -ENOMEM;  			goto out_unwind_new_pags; +		}  		pag->pag_agno = index;  		pag->pag_mount = mp;  		spin_lock_init(&pag->pag_ici_lock);  		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); -		if (xfs_buf_hash_init(pag)) + +		error = xfs_buf_hash_init(pag); +		if (error)  			goto out_free_pag;  		init_waitqueue_head(&pag->pagb_wait);  		spin_lock_init(&pag->pagb_lock);  		pag->pagb_count = 0;  		pag->pagb_tree = RB_ROOT; -		if (radix_tree_preload(GFP_NOFS)) +		error = radix_tree_preload(GFP_NOFS); +		if (error)  			goto out_hash_destroy;  		spin_lock(&mp->m_perag_lock); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index b101feb2aab4..f3082a957d5e 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -134,7 +134,7 @@ xfs_fs_map_blocks(  		goto out_unlock;  	error = invalidate_inode_pages2(inode->i_mapping);  	if (WARN_ON_ONCE(error)) -		return error; +		goto out_unlock;  	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);  	offset_fsb = XFS_B_TO_FSBT(mp, offset); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 16098dc42add..6fa05fb78189 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1502,7 +1502,8 @@ xfs_reflink_unshare(  			&xfs_buffered_write_iomap_ops);  	if (error)  		goto out; -	error = filemap_write_and_wait(inode->i_mapping); + +	error = filemap_write_and_wait_range(inode->i_mapping, offset, len);  	if (error)  		goto out;
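
The fs/isofs/rock.h and fs/select.c hunks above replace zero-length trailing arrays ([0]) with C99 flexible array members ([]), so the compiler and fortified string routines can flag out-of-bounds accesses. A minimal, self-contained sketch of the allocation pattern these structs rely on (hypothetical struct and field names, not the kernel definitions):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical record with a C99 flexible array member, mirroring the
 * rock.h conversion from "__u8 data[0];" to "__u8 data[];". */
struct record {
	unsigned char len;	/* number of payload bytes that follow */
	unsigned char data[];	/* flexible array member: adds no size */
};

int main(void)
{
	const unsigned char payload[] = { 'R', 'R', 'I', 'P' };
	/* sizeof(*r) counts only the fixed header; the trailing payload
	 * must be included in the allocation explicitly. */
	struct record *r = malloc(sizeof(*r) + sizeof(payload));

	if (!r)
		return 1;
	r->len = sizeof(payload);
	memcpy(r->data, payload, sizeof(payload));
	printf("header %zu bytes, payload %u bytes\n", sizeof(*r),
	       (unsigned int)r->len);
	free(r);
	return 0;
}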
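
The jbd2_fc_begin_commit() hunk moves the tid <= j_commit_sequence check under j_state_lock, since a check made before taking the lock can be stale by the time the lock is held. A minimal pthreads sketch of that check-under-lock pattern (hypothetical names; the real code uses a rwlock and jbd2-specific state):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long commit_sequence;	/* protected by lock */

/* Racy shape (the old code): the sequence can advance between the
 * unlocked read and the moment the lock is finally taken. */
static int begin_commit_racy(unsigned long tid)
{
	if (tid <= commit_sequence)	/* unlocked read: may be stale */
		return -1;
	pthread_mutex_lock(&lock);
	/* ... proceed on a decision that may no longer hold ... */
	pthread_mutex_unlock(&lock);
	return 0;
}

/* Fixed shape (the new code): decide only while holding the lock. */
static int begin_commit(unsigned long tid)
{
	pthread_mutex_lock(&lock);
	if (tid <= commit_sequence) {
		pthread_mutex_unlock(&lock);
		return -1;	/* -EALREADY in the jbd2 version */
	}
	/* ... the state cannot change under us here ... */
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	printf("racy=%d locked=%d\n", begin_commit_racy(1), begin_commit(1));
	return 0;
}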
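
The fs/libfs.c hunk swaps simple_strtoll(), which silently accepts trailing garbage, for kstrtoull(), which fails on malformed or overflowing input so the error can be propagated. A userspace approximation of the stricter parse, built on strtoull() with explicit errno and end-pointer checks (a sketch of the idea, not the kernel helper itself):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse an unsigned 64-bit value, rejecting empty input, trailing
 * garbage, and out-of-range values -- roughly what kstrtoull() enforces
 * where simple_strtoll() would silently stop at the first bad byte. */
static int parse_u64_strict(const char *s, unsigned long long *out)
{
	char *end;

	errno = 0;
	*out = strtoull(s, &end, 0);	/* base 0: also accepts 0x.. / 0.. */
	if (errno == ERANGE)
		return -ERANGE;
	if (end == s || *end != '\0')
		return -EINVAL;	/* nothing parsed, or junk after digits */
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("\"42\"  -> %d\n", parse_u64_strict("42", &v));	/* 0 */
	printf("\"42x\" -> %d\n", parse_u64_strict("42x", &v));	/* -EINVAL */
	printf("\"\"    -> %d\n", parse_u64_strict("", &v));	/* -EINVAL */
	return 0;
}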
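
The fs/xfs/scrub/bmap.c hunk tightens the rmap cross-check from a one-sided test to a two-sided comparison of normalized booleans, so corruption is reported when either side sets the unwritten (or attr-fork) flag and the other does not. A small standalone illustration of why the !! idiom is needed when one operand is a bit-mask test (FLAG_UNWRITTEN here is a made-up bit, not the XFS_RMAP_UNWRITTEN value):

#include <assert.h>
#include <stdio.h>

#define FLAG_UNWRITTEN 0x8	/* made-up flag bit, for illustration only */

int main(void)
{
	int ext_unwritten = 1;			/* already 0 or 1 */
	unsigned int rm_flags = FLAG_UNWRITTEN;	/* mask test yields 0x8 */

	/* Comparing the raw values misfires: 1 != 0x8 even though both
	 * sides mean "unwritten"... */
	assert(ext_unwritten != (int)(rm_flags & FLAG_UNWRITTEN));
	/* ...so normalize each side to 0/1 with !! before comparing,
	 * exactly as the scrub check now does. */
	assert(!!ext_unwritten == !!(rm_flags & FLAG_UNWRITTEN));
	printf("normalized comparison agrees\n");
	return 0;
}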
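
The imap_spans_range() helper added in fs/xfs/xfs_iomap.c is a plain interval-containment predicate: the found mapping satisfies a NOWAIT request only if it starts at or before the first requested block and extends at least to the last one. The same logic as a tiny standalone function with simplified types:

#include <assert.h>
#include <stdio.h>

/* Does the extent [start, start + count) cover the whole request
 * [first, last)?  Same shape as imap_spans_range(), with plain types. */
static int spans_range(unsigned long long start, unsigned long long count,
		       unsigned long long first, unsigned long long last)
{
	if (start > first)
		return 0;	/* mapping begins after the request */
	if (start + count < last)
		return 0;	/* mapping ends before the request */
	return 1;
}

int main(void)
{
	assert(spans_range(0, 16, 4, 12));	/* fully covered: OK */
	assert(!spans_range(8, 16, 4, 12));	/* starts too late: EAGAIN */
	assert(!spans_range(0, 8, 4, 12));	/* ends too early: EAGAIN */
	printf("NOWAIT span checks behave as expected\n");
	return 0;
}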
