Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--	fs/btrfs/file.c	374
1 file changed, 133 insertions(+), 241 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4fb521d91b06..588c353d2969 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -37,33 +37,30 @@
 #include "file.h"
 #include "super.h"
 
-/* simple helper to fault in pages and copy.  This should go away
- * and be replaced with calls into generic code.
+/*
+ * Helper to fault in page and copy.  This should go away and be replaced with
+ * calls into generic code.
  */
 static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
-					 struct page **prepared_pages,
-					 struct iov_iter *i)
+					 struct folio *folio, struct iov_iter *i)
 {
 	size_t copied = 0;
 	size_t total_copied = 0;
-	int pg = 0;
 	int offset = offset_in_page(pos);
 
 	while (write_bytes > 0) {
-		size_t count = min_t(size_t,
-				     PAGE_SIZE - offset, write_bytes);
-		struct page *page = prepared_pages[pg];
+		size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
 		/*
 		 * Copy data from userspace to the current page
 		 */
-		copied = copy_page_from_iter_atomic(page, offset, count, i);
+		copied = copy_folio_from_iter_atomic(folio, offset, count, i);
 
 		/* Flush processor's dcache for this page */
-		flush_dcache_page(page);
+		flush_dcache_folio(folio);
 
 		/*
 		 * if we get a partial write, we can end up with
-		 * partially up to date pages.  These add
+		 * partially up to date page.  These add
 		 * a lot of complexity, so make sure they don't
 		 * happen by forcing this copy to be retried.
 		 *
@@ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 		 * back to page at a time copies after we return 0.
 		 */
 		if (unlikely(copied < count)) {
-			if (!PageUptodate(page)) {
+			if (!folio_test_uptodate(folio)) {
 				iov_iter_revert(i, copied);
 				copied = 0;
 			}
@@ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 		write_bytes -= copied;
 		total_copied += copied;
 		offset += copied;
-		if (offset == PAGE_SIZE) {
-			pg++;
-			offset = 0;
-		}
 	}
 	return total_copied;
 }
 
 /*
- * unlocks pages after btrfs_file_write is done with them
+ * Unlock folio after btrfs_file_write() is done with it.
  */
-static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
-			     struct page **pages, size_t num_pages,
+static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
 			     u64 pos, u64 copied)
 {
-	size_t i;
 	u64 block_start = round_down(pos, fs_info->sectorsize);
 	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
 
 	ASSERT(block_len <= U32_MAX);
-	for (i = 0; i < num_pages; i++) {
-		/* page checked is some magic around finding pages that
-		 * have been modified without going through btrfs_set_page_dirty
-		 * clear it here. There should be no need to mark the pages
-		 * accessed as prepare_pages should have marked them accessed
-		 * in prepare_pages via find_or_create_page()
-		 */
-		btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
-						block_start, block_len);
-		unlock_page(pages[i]);
-		put_page(pages[i]);
-	}
+	/*
+	 * Folio checked is some magic around finding folios that have been
+	 * modified without going through btrfs_dirty_folio().  Clear it here.
+	 * There should be no need to mark the pages accessed as
+	 * prepare_one_folio() should have marked them accessed in
+	 * prepare_one_folio() via find_or_create_page().
+	 */
+	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
+	folio_unlock(folio);
+	folio_put(folio);
 }
 
 /*
  * After btrfs_copy_from_user(), update the following things for delalloc:
- * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * - Mark newly dirtied folio as DELALLOC in the io tree.
  *   Used to advise which range is to be written back.
- * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
  * - Update inode size for past EOF write
  */
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
-		      size_t num_pages, loff_t pos, size_t write_bytes,
-		      struct extent_state **cached, bool noreserve)
+int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
+		      size_t write_bytes, struct extent_state **cached, bool noreserve)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	int ret = 0;
-	int i;
 	u64 num_bytes;
 	u64 start_pos;
 	u64 end_of_last_block;
@@ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 	num_bytes = round_up(write_bytes + pos - start_pos,
 			     fs_info->sectorsize);
 	ASSERT(num_bytes <= U32_MAX);
+	ASSERT(folio_pos(folio) <= pos &&
+	       folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
 
 	end_of_last_block = start_pos + num_bytes - 1;
 
@@ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 	if (ret)
 		return ret;
 
-	for (i = 0; i < num_pages; i++) {
-		struct page *p = pages[i];
-
-		btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
-					       start_pos, num_bytes);
-		btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
-						start_pos, num_bytes);
-		btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
-					    start_pos, num_bytes);
-	}
+	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
+	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
+	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
 
 	/*
 	 * we've only changed i_size in ram, and we haven't updated
@@ -851,53 +833,47 @@ out:
 }
 
 /*
- * on error we return an unlocked page and the error value
- * on success we return a locked page and 0
+ * On error return an unlocked folio and the error value
+ * On success return a locked folio and 0
  */
-static int prepare_uptodate_page(struct inode *inode,
-				 struct page *page, u64 pos,
-				 bool force_uptodate)
+static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
+				  u64 len, bool force_uptodate)
 {
-	struct folio *folio = page_folio(page);
+	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
+	u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
 	int ret = 0;
 
-	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
-	    !PageUptodate(page)) {
-		ret = btrfs_read_folio(NULL, folio);
-		if (ret)
-			return ret;
-		lock_page(page);
-		if (!PageUptodate(page)) {
-			unlock_page(page);
-			return -EIO;
-		}
-
-		/*
-		 * Since btrfs_read_folio() will unlock the folio before it
-		 * returns, there is a window where btrfs_release_folio() can be
-		 * called to release the page.  Here we check both inode
-		 * mapping and PagePrivate() to make sure the page was not
-		 * released.
-		 *
-		 * The private flag check is essential for subpage as we need
-		 * to store extra bitmap using folio private.
-		 */
-		if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
-			unlock_page(page);
-			return -EAGAIN;
-		}
-	}
-	return 0;
-}
+	if (folio_test_uptodate(folio))
+		return 0;
 
-static fgf_t get_prepare_fgp_flags(bool nowait)
-{
-	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+	if (!force_uptodate &&
+	    IS_ALIGNED(clamp_start, PAGE_SIZE) &&
+	    IS_ALIGNED(clamp_end, PAGE_SIZE))
+		return 0;
 
-	if (nowait)
-		fgp_flags |= FGP_NOWAIT;
+	ret = btrfs_read_folio(NULL, folio);
+	if (ret)
+		return ret;
+	folio_lock(folio);
+	if (!folio_test_uptodate(folio)) {
+		folio_unlock(folio);
+		return -EIO;
+	}
 
-	return fgp_flags;
+	/*
+	 * Since btrfs_read_folio() will unlock the folio before it returns,
+	 * there is a window where btrfs_release_folio() can be called to
+	 * release the page.  Here we check both inode mapping and page
+	 * private to make sure the page was not released.
+	 *
+	 * The private flag check is essential for subpage as we need to store
+	 * extra bitmap using folio private.
	 */
+	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
+		folio_unlock(folio);
+		return -EAGAIN;
+	}
+	return 0;
 }
 
 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
@@ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
 }
 
 /*
- * this just gets pages into the page cache and locks them down.
+ * Get folio into the page cache and lock it.
  */
-static noinline int prepare_pages(struct inode *inode, struct page **pages,
-				  size_t num_pages, loff_t pos,
-				  size_t write_bytes, bool force_uptodate,
-				  bool nowait)
+static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
+				      loff_t pos, size_t write_bytes,
+				      bool force_uptodate, bool nowait)
 {
-	int i;
 	unsigned long index = pos >> PAGE_SHIFT;
 	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
-	fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
+	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
+	struct folio *folio;
 	int ret = 0;
-	int faili;
 
-	for (i = 0; i < num_pages; i++) {
again:
-		pages[i] = pagecache_get_page(inode->i_mapping, index + i,
-					      fgp_flags, mask | __GFP_WRITE);
-		if (!pages[i]) {
-			faili = i - 1;
-			if (nowait)
-				ret = -EAGAIN;
-			else
-				ret = -ENOMEM;
-			goto fail;
-		}
-
-		ret = set_page_extent_mapped(pages[i]);
-		if (ret < 0) {
-			faili = i;
-			goto fail;
-		}
-
-		if (i == 0)
-			ret = prepare_uptodate_page(inode, pages[i], pos,
-						    force_uptodate);
-		if (!ret && i == num_pages - 1)
-			ret = prepare_uptodate_page(inode, pages[i],
-						    pos + write_bytes, false);
-		if (ret) {
-			put_page(pages[i]);
-			if (!nowait && ret == -EAGAIN) {
-				ret = 0;
-				goto again;
-			}
-			faili = i - 1;
-			goto fail;
+	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
+	if (IS_ERR(folio)) {
+		if (nowait)
+			ret = -EAGAIN;
+		else
+			ret = PTR_ERR(folio);
+		return ret;
+	}
+	/* Only support page sized folio yet. */
+	ASSERT(folio_order(folio) == 0);
+	ret = set_folio_extent_mapped(folio);
+	if (ret < 0) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return ret;
+	}
+	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
+	if (ret) {
+		/* The folio is already unlocked. */
+		folio_put(folio);
+		if (!nowait && ret == -EAGAIN) {
+			ret = 0;
+			goto again;
		}
-		wait_on_page_writeback(pages[i]);
+		return ret;
 	}
-
+	*folio_ret = folio;
 	return 0;
-fail:
-	while (faili >= 0) {
-		unlock_page(pages[faili]);
-		put_page(pages[faili]);
-		faili--;
-	}
-	return ret;
-
 }
 
 /*
- * This function locks the extent and properly waits for data=ordered extents
- * to finish before allowing the pages to be modified if need.
+ * Locks the extent and properly waits for data=ordered extents to finish
+ * before allowing the folios to be modified if needed.
  *
- * The return value:
+ * Return:
  * 1 - the extent is locked
  * 0 - the extent is not locked, and everything is OK
- * -EAGAIN - need re-prepare the pages
- * the other < 0 number - Something wrong happens
+ * -EAGAIN - need to prepare the folios again
  */
 static noinline int
-lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
-				size_t num_pages, loff_t pos,
-				size_t write_bytes,
+lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
+				loff_t pos, size_t write_bytes,
 				u64 *lockstart, u64 *lockend, bool nowait,
 				struct extent_state **cached_state)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	u64 start_pos;
 	u64 last_pos;
-	int i;
 	int ret = 0;
 
 	start_pos = round_down(pos, fs_info->sectorsize);
@@ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 		if (nowait) {
 			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
 					     cached_state)) {
-				for (i = 0; i < num_pages; i++) {
-					unlock_page(pages[i]);
-					put_page(pages[i]);
-					pages[i] = NULL;
-				}
-
+				folio_unlock(folio);
+				folio_put(folio);
 				return -EAGAIN;
 			}
 		} else {
@@ -1027,10 +977,8 @@
 		    ordered->file_offset <= last_pos) {
 			unlock_extent(&inode->io_tree, start_pos, last_pos,
 				      cached_state);
-			for (i = 0; i < num_pages; i++) {
-				unlock_page(pages[i]);
-				put_page(pages[i]);
-			}
+			folio_unlock(folio);
+			folio_put(folio);
 			btrfs_start_ordered_extent(ordered);
 			btrfs_put_ordered_extent(ordered);
 			return -EAGAIN;
@@ -1044,11 +992,10 @@
 	}
 
 	/*
-	 * We should be called after prepare_pages() which should have locked
+	 * We should be called after prepare_one_folio() which should have locked
 	 * all pages in the range.
 	 */
-	for (i = 0; i < num_pages; i++)
-		WARN_ON(!PageLocked(pages[i]));
+	WARN_ON(!folio_test_locked(folio));
 
 	return ret;
 }
@@ -1120,27 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
 	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
 }
 
-static void update_time_for_write(struct inode *inode)
-{
-	struct timespec64 now, ts;
-
-	if (IS_NOCMTIME(inode))
-		return;
-
-	now = current_time(inode);
-	ts = inode_get_mtime(inode);
-	if (!timespec64_equal(&ts, &now))
-		inode_set_mtime_to_ts(inode, now);
-
-	ts = inode_get_ctime(inode);
-	if (!timespec64_equal(&ts, &now))
-		inode_set_ctime_to_ts(inode, now);
-
-	if (IS_I_VERSION(inode))
-		inode_inc_iversion(inode);
-}
-
-int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
+int btrfs_write_check(struct kiocb *iocb, size_t count)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -1170,7 +1097,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
 	 * need to start yet another transaction to update the inode as we will
 	 * update the inode when we finish writing whatever data we write.
 	 */
-	update_time_for_write(inode);
+	if (!IS_NOCMTIME(inode)) {
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+		inode_inc_iversion(inode);
+	}
 
 	start_pos = round_down(pos, fs_info->sectorsize);
 	oldsize = i_size_read(inode);
@@ -1192,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 	loff_t pos;
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-	struct page **pages = NULL;
 	struct extent_changeset *data_reserved = NULL;
 	u64 release_bytes = 0;
 	u64 lockstart;
 	u64 lockend;
 	size_t num_written = 0;
-	int nrptrs;
 	ssize_t ret;
-	bool only_release_metadata = false;
-	bool force_page_uptodate = false;
 	loff_t old_isize = i_size_read(inode);
 	unsigned int ilock_flags = 0;
 	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
 	unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+	bool only_release_metadata = false;
 
 	if (nowait)
 		ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1218,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 	if (ret <= 0)
 		goto out;
 
-	ret = btrfs_write_check(iocb, i, ret);
+	ret = btrfs_write_check(iocb, ret);
 	if (ret < 0)
 		goto out;
 
 	pos = iocb->ki_pos;
-	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
-			PAGE_SIZE / (sizeof(struct page *)));
-	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
-	nrptrs = max(nrptrs, 8);
-	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
-	if (!pages) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	while (iov_iter_count(i) > 0) {
 		struct extent_state *cached_state = NULL;
 		size_t offset = offset_in_page(pos);
 		size_t sector_offset;
-		size_t write_bytes = min(iov_iter_count(i),
-					 nrptrs * (size_t)PAGE_SIZE -
-					 offset);
-		size_t num_pages;
+		size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
 		size_t reserve_bytes;
-		size_t dirty_pages;
 		size_t copied;
 		size_t dirty_sectors;
 		size_t num_sectors;
+		struct folio *folio = NULL;
 		int extents_locked;
+		bool force_page_uptodate = false;
 
 		/*
-		 * Fault pages before locking them in prepare_pages
+		 * Fault pages before locking them in prepare_one_folio()
 		 * to avoid recursive lock
 		 */
 		if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
@@ -1288,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 			only_release_metadata = true;
 		}
 
-		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
-		WARN_ON(num_pages > nrptrs);
 		reserve_bytes = round_up(write_bytes + sector_offset,
 					 fs_info->sectorsize);
 		WARN_ON(reserve_bytes == 0);
@@ -1317,23 +1230,17 @@ again:
 			break;
 		}
 
-		/*
-		 * This is going to setup the pages array with the number of
-		 * pages we want, so we don't really need to worry about the
-		 * contents of pages from loop to loop
-		 */
-		ret = prepare_pages(inode, pages, num_pages,
-				    pos, write_bytes, force_page_uptodate, false);
+		ret = prepare_one_folio(inode, &folio, pos, write_bytes,
+					force_page_uptodate, false);
 		if (ret) {
 			btrfs_delalloc_release_extents(BTRFS_I(inode),
 						       reserve_bytes);
 			break;
 		}
 
-		extents_locked = lock_and_cleanup_extent_if_need(
-				BTRFS_I(inode), pages,
-				num_pages, pos, write_bytes, &lockstart,
-				&lockend, nowait, &cached_state);
+		extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
+						folio, pos, write_bytes, &lockstart,
+						&lockend, nowait, &cached_state);
 		if (extents_locked < 0) {
 			if (!nowait && extents_locked == -EAGAIN)
 				goto again;
@@ -1344,28 +1251,18 @@ again:
 			break;
 		}
 
-		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
+		copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
 
 		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
 		dirty_sectors = round_up(copied + sector_offset,
 					fs_info->sectorsize);
 		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
 
-		/*
-		 * if we have trouble faulting in the pages, fall
-		 * back to one page at a time
-		 */
-		if (copied < write_bytes)
-			nrptrs = 1;
-
 		if (copied == 0) {
 			force_page_uptodate = true;
 			dirty_sectors = 0;
-			dirty_pages = 0;
 		} else {
 			force_page_uptodate = false;
-			dirty_pages = DIV_ROUND_UP(copied + offset,
-						   PAGE_SIZE);
 		}
 
 		if (num_sectors > dirty_sectors) {
@@ -1375,13 +1272,10 @@ again:
 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
 							release_bytes, true);
 			} else {
-				u64 __pos;
-
-				__pos = round_down(pos,
-						   fs_info->sectorsize) +
-					(dirty_pages << PAGE_SHIFT);
+				u64 release_start = round_up(pos + copied,
+							     fs_info->sectorsize);
 				btrfs_delalloc_release_space(BTRFS_I(inode),
-						data_reserved, __pos,
+						data_reserved, release_start,
 						release_bytes, true);
 			}
 		}
@@ -1389,15 +1283,14 @@ again:
 
 		release_bytes = round_up(copied + sector_offset,
 					 fs_info->sectorsize);
-		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
-					dirty_pages, pos, copied,
+		ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
 					&cached_state, only_release_metadata);
 
 		/*
 		 * If we have not locked the extent range, because the range's
 		 * start offset is >= i_size, we might still have a non-NULL
 		 * cached extent state, acquired while marking the extent range
-		 * as delalloc through btrfs_dirty_pages(). Therefore free any
+		 * as delalloc through btrfs_dirty_folio(). Therefore free any
 		 * possible cached extent state to avoid a memory leak.
 		 */
 		if (extents_locked)
@@ -1408,7 +1301,7 @@ again:
 		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
 
 		if (ret) {
-			btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+			btrfs_drop_folio(fs_info, folio, pos, copied);
 			break;
 		}
 
@@ -1416,7 +1309,7 @@ again:
 		if (only_release_metadata)
 			btrfs_check_nocow_unlock(BTRFS_I(inode));
 
-		btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+		btrfs_drop_folio(fs_info, folio, pos, copied);
 
 		cond_resched();
 
@@ -1424,8 +1317,6 @@ again:
 		num_written += copied;
 	}
 
-	kfree(pages);
-
 	if (release_bytes) {
 		if (only_release_metadata) {
 			btrfs_check_nocow_unlock(BTRFS_I(inode));
@@ -1470,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	if (ret || encoded->len == 0)
 		goto out;
 
-	ret = btrfs_write_check(iocb, from, encoded->len);
+	ret = btrfs_write_check(iocb, encoded->len);
 	if (ret < 0)
 		goto out;
 
@@ -3802,6 +3693,7 @@ const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 	.remap_file_range = btrfs_remap_file_range,
+	.uring_cmd	= btrfs_uring_cmd,
 	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
 };
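The net effect of the conversion on the write path is easiest to see as a sketch of the rewritten loop. The following is a simplified illustration assembled from the hunks above, not buildable kernel code: space reservation, NOCOW handling, extent locking and most error paths of btrfs_buffered_write() are omitted, and the local variable names are reused from that function purely for orientation.

	/*
	 * One folio per iteration: prepare, copy, dirty, release.  The old
	 * pages[]/nrptrs batch bookkeeping disappears because write_bytes
	 * is clamped to a single page boundary up front.
	 */
	while (iov_iter_count(i) > 0) {
		size_t offset = offset_in_page(pos);
		size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
		struct folio *folio = NULL;
		size_t copied;

		ret = prepare_one_folio(inode, &folio, pos, write_bytes,
					false, false);
		if (ret)
			break;

		/* Copy from the iov_iter into the single locked folio. */
		copied = btrfs_copy_from_user(pos, write_bytes, folio, i);

		/* Mark the range delalloc/uptodate/dirty in one call. */
		ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
					&cached_state, only_release_metadata);
		btrfs_drop_folio(fs_info, folio, pos, copied); /* unlock + put */
		if (ret)
			break;

		pos += copied;
		num_written += copied;
	}

With 4 KiB pages, a 10,000-byte write starting at file offset 4000 therefore takes four passes: 96 bytes up to the first page boundary, then 4096, another 4096, and the final 1712.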
