Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--	fs/xfs/xfs_file.c	116
1 file changed, 89 insertions(+), 27 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb400e22..f5392ab2def1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -242,19 +242,30 @@ xfs_file_fsync(
 	}
 
 	/*
-	 * All metadata updates are logged, which means that we just have
-	 * to flush the log up to the latest LSN that touched the inode.
+	 * All metadata updates are logged, which means that we just have to
+	 * flush the log up to the latest LSN that touched the inode. If we have
+	 * concurrent fsync/fdatasync() calls, we need them to all block on the
+	 * log force before we clear the ili_fsync_fields field. This ensures
+	 * that we don't get a racing sync operation that does not wait for the
+	 * metadata to hit the journal before returning. If we race with
+	 * clearing the ili_fsync_fields, then all that will happen is the log
+	 * force will do nothing as the lsn will already be on disk. We can't
+	 * race with setting ili_fsync_fields because that is done under
+	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+	 * until after the ili_fsync_fields is cleared.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	if (xfs_ipincount(ip)) {
 		if (!datasync ||
-		    (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 			lsn = ip->i_itemp->ili_last_lsn;
 	}
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	if (lsn)
+	if (lsn) {
 		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+		ip->i_itemp->ili_fsync_fields = 0;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	/*
 	 * If we only have a single device, and the log force about was
@@ -287,7 +298,7 @@ xfs_file_read_iter(
 	xfs_fsize_t		n;
 	loff_t			pos = iocb->ki_pos;
 
-	XFS_STATS_INC(xs_read_calls);
+	XFS_STATS_INC(mp, xs_read_calls);
 
 	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
 		ioflags |= XFS_IO_ISDIRECT;
@@ -365,7 +376,7 @@ xfs_file_read_iter(
 
 	ret = generic_file_read_iter(iocb, to);
 	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
+		XFS_STATS_ADD(mp, xs_read_bytes, ret);
 
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
@@ -383,7 +394,7 @@ xfs_file_splice_read(
 	int			ioflags = 0;
 	ssize_t			ret;
 
-	XFS_STATS_INC(xs_read_calls);
+	XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
 	if (infilp->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
@@ -401,7 +412,7 @@ xfs_file_splice_read(
 	else
 		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
+		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
@@ -482,6 +493,8 @@ xfs_zero_eof(
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(offset > isize);
 
+	trace_xfs_zero_eof(ip, isize, offset - isize);
+
 	/*
 	 * First handle zeroing the block on which isize resides.
 	 *
@@ -574,6 +587,7 @@ xfs_file_aio_write_checks(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			error = 0;
 	size_t			count = iov_iter_count(from);
+	bool			drained_dio = false;
 
 restart:
 	error = generic_write_checks(iocb, from);
@@ -611,12 +625,13 @@ restart:
 		bool	zero = false;
 
 		spin_unlock(&ip->i_flags_lock);
-		if (*iolock == XFS_IOLOCK_SHARED) {
-			xfs_rw_iunlock(ip, *iolock);
-			*iolock = XFS_IOLOCK_EXCL;
-			xfs_rw_ilock(ip, *iolock);
-			iov_iter_reexpand(from, count);
-
+		if (!drained_dio) {
+			if (*iolock == XFS_IOLOCK_SHARED) {
+				xfs_rw_iunlock(ip, *iolock);
+				*iolock = XFS_IOLOCK_EXCL;
+				xfs_rw_ilock(ip, *iolock);
+				iov_iter_reexpand(from, count);
+			}
 			/*
 			 * We now have an IO submission barrier in place, but
 			 * AIO can do EOF updates during IO completion and hence
@@ -626,6 +641,7 @@ restart:
 			 * no-op.
 			 */
 			inode_dio_wait(inode);
+			drained_dio = true;
 			goto restart;
 		}
 		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@ -867,7 +883,7 @@ xfs_file_write_iter(
 	ssize_t			ret;
 	size_t			ocount = iov_iter_count(from);
 
-	XFS_STATS_INC(xs_write_calls);
+	XFS_STATS_INC(ip->i_mount, xs_write_calls);
 
 	if (ocount == 0)
 		return 0;
@@ -883,7 +899,7 @@ xfs_file_write_iter(
 	if (ret > 0) {
 		ssize_t err;
 
-		XFS_STATS_ADD(xs_write_bytes, ret);
+		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
 		/* Handle various SYNC-type writes */
 		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1477,7 +1493,7 @@ xfs_file_llseek(
  *
  * mmap_sem (MM)
  *   sb_start_pagefault(vfs, freeze)
- *     i_mmap_lock (XFS - truncate serialisation)
+ *     i_mmaplock (XFS - truncate serialisation)
  *       page_lock (MM)
  *         i_lock (XFS - extent map serialisation)
  */
@@ -1503,10 +1519,9 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
-				    xfs_end_io_dax_write);
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
 	} else {
-		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
 		ret = block_page_mkwrite_return(ret);
 	}
 
@@ -1538,7 +1553,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+		ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1546,6 +1561,13 @@ xfs_filemap_fault(
 	return ret;
 }
 
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases. @flags carries the information on the type of fault
+ * occurring.
+ */
 STATIC int
 xfs_filemap_pmd_fault(
 	struct vm_area_struct	*vma,
@@ -1562,15 +1584,54 @@ xfs_filemap_pmd_fault(
 
 	trace_xfs_filemap_pmd_fault(ip);
 
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	if (flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+	}
+
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
-				    xfs_end_io_dax_write);
+	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+			      NULL);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
+	if (flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(inode->i_sb);
+
+	return ret;
+}
+
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults. In reality, it's needed to serialise against
+ * truncate, similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle XFS_MMAPLOCK_SHARED to ensure we have the fault
+ * serialisation barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+
+	struct inode		*inode = file_inode(vma->vm_file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			ret = VM_FAULT_NOPAGE;
+	loff_t			size;
+
+	trace_xfs_filemap_pfn_mkwrite(ip);
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+
+	/* check if the faulting page hasn't raced with truncate */
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
 
 	return ret;
+
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1578,6 +1639,7 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
 	.pmd_fault	= xfs_filemap_pmd_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
+	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
 };
 
 STATIC int
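A note on the fsync hunk: ili_fsync_fields is cleared only after _xfs_log_force_lsn() returns, and while XFS_ILOCK_SHARED is still held, so a transaction that dirties the inode (which must take XFS_ILOCK_EXCL) can never slip in between the log force and the clear. The following userspace sketch mirrors only that ordering; a pthread rwlock stands in for the XFS inode lock, a flag stands in for the on-disk log, and every name here (file_fsync, dirty_inode, log_force) is an illustrative stand-in, not kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t ilock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned int fsync_fields;	/* plays the role of ili_fsync_fields */
static bool log_on_disk;		/* "lsn already on disk" */

static void log_force(void)
{
	/* stand-in for _xfs_log_force_lsn(): idempotent once flushed */
	log_on_disk = true;
}

static void dirty_inode(unsigned int fields)
{
	/* setters run under the exclusive lock, as under XFS_ILOCK_EXCL */
	pthread_rwlock_wrlock(&ilock);
	fsync_fields |= fields;
	log_on_disk = false;
	pthread_rwlock_unlock(&ilock);
}

static void file_fsync(void)
{
	/*
	 * Hold the lock shared across BOTH the force and the clear, so no
	 * dirty_inode() can interleave between them. Concurrent fsyncs do
	 * not exclude each other; only the setter is locked out.
	 */
	pthread_rwlock_rdlock(&ilock);
	if (fsync_fields) {
		log_force();
		fsync_fields = 0;	/* cleared only after the flush */
	}
	pthread_rwlock_unlock(&ilock);
}

int main(void)
{
	dirty_inode(0x1);
	file_fsync();
	printf("fields=%u, on disk=%d\n", fsync_fields, (int)log_on_disk);
	return 0;
}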
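The drained_dio change in xfs_file_aio_write_checks() is a loop-shaping fix: inode_dio_wait() can let an AIO completion move EOF, so the beyond-EOF check must be redone afterwards, but the lock upgrade and drain must not be repeated on the second pass. A toy model of that control flow, with hypothetical helpers (upgrade_lock, inode_dio_wait, zero_eof) simulating the kernel behaviour:

#include <stdbool.h>
#include <stdio.h>

static long isize = 4096;	/* current EOF */
static long pos   = 12288;	/* write offset, beyond EOF */

static void upgrade_lock(void) { puts("iolock upgraded to exclusive"); }

static void inode_dio_wait(void)
{
	/* simulate an AIO completion extending EOF while we drained */
	isize = 8192;
	puts("in-flight direct IO drained");
}

static void zero_eof(void)
{
	printf("zeroing bytes %ld..%ld\n", isize, pos);
}

static void write_checks(void)
{
	bool drained_dio = false;
restart:
	if (pos > isize) {
		if (!drained_dio) {
			upgrade_lock();
			inode_dio_wait();	/* may move EOF ... */
			drained_dio = true;
			goto restart;		/* ... so re-check, exactly once */
		}
		zero_eof();
	}
}

int main(void)
{
	write_checks();
	return 0;
}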
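The pmd_fault hunk gates sb_start_pagefault()/file_update_time() on FAULT_FLAG_WRITE because read faults do not modify the file, so they should neither block on a frozen filesystem nor dirty timestamps. A minimal sketch of that gating with toy userspace stand-ins for the kernel helpers:

#include <stdio.h>

#define FAULT_FLAG_WRITE 0x1

static void sb_start_pagefault(void) { puts("freeze protection taken"); }
static void sb_end_pagefault(void)   { puts("freeze protection dropped"); }
static void file_update_time(void)   { puts("timestamps dirtied"); }
static void handle_fault(void)       { puts("fault handled"); }

static void pmd_fault(unsigned int flags)
{
	/* only a write fault dirties the file, so only it needs to
	 * block on a frozen filesystem and bump the timestamps */
	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault();
		file_update_time();
	}

	handle_fault();

	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault();
}

int main(void)
{
	pmd_fault(0);			/* read fault: no freeze protection */
	pmd_fault(FAULT_FLAG_WRITE);	/* write fault: full protocol */
	return 0;
}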
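Finally, the open-coded truncate check in xfs_filemap_pfn_mkwrite() rounds i_size up to whole pages and compares the faulting page index against that count. A standalone illustration of just that arithmetic, assuming the common 4 KiB page size:

#include <stdio.h>

#define PAGE_SIZE  4096L
#define PAGE_SHIFT 12

/* returns 1 if a fault on page index 'pgoff' has raced with truncate */
static int raced_with_truncate(long i_size, long pgoff)
{
	/* number of pages wholly or partially covered by the file */
	long size = (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	return pgoff >= size;
}

int main(void)
{
	/* a 5000 byte file covers two 4 KiB pages (indexes 0 and 1) */
	printf("pgoff 1: %s\n",
	       raced_with_truncate(5000, 1) ? "SIGBUS" : "fault ok");
	printf("pgoff 2: %s\n",
	       raced_with_truncate(5000, 2) ? "SIGBUS" : "fault ok");
	return 0;
}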
