From ae42a154ca8972739be29f811a69bef6c4818a26 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 17:39:39 +0100
Subject: btrfs: pass a btrfs_bio to btrfs_submit_bio

btrfs_submit_bio expects the bio passed to it to be embedded into a
btrfs_bio structure.  Pass the btrfs_bio directly to increase type
safety and make the code self-documenting.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 726592868e9c..c04e103f8768 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -164,7 +164,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 			goto done;
 		}
 
-		btrfs_submit_bio(&repair_bbio->bio, mirror);
+		btrfs_submit_bio(repair_bbio, mirror);
 		return;
 	}
 
@@ -232,7 +232,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 
 	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
 	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
-	btrfs_submit_bio(repair_bio, mirror);
+	btrfs_submit_bio(repair_bbio, mirror);
 	return fbio;
 }
 
@@ -603,12 +603,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 	return true;
 }
 
-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
+static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 {
-	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_inode *inode = bbio->inode;
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
+	struct bio *bio = &bbio->bio;
 	u64 logical = bio->bi_iter.bi_sector << 9;
 	u64 length = bio->bi_iter.bi_size;
 	u64 map_length = length;
@@ -650,7 +650,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 		if (use_append) {
 			bio->bi_opf &= ~REQ_OP_WRITE;
 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
-			ret = btrfs_extract_ordered_extent(btrfs_bio(bio));
+			ret = btrfs_extract_ordered_extent(bbio);
 			if (ret)
 				goto fail_put_bio;
 		}
@@ -686,9 +686,9 @@ fail:
 	return true;
 }
 
-void btrfs_submit_bio(struct bio *bio, int mirror_num)
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
 {
-	while (!btrfs_submit_chunk(bio, mirror_num))
+	while (!btrfs_submit_chunk(bbio, mirror_num))
 		;
 }
 
-- 
cgit v1.2.3


From b41bbd293e64016b3bfad4e5f709fcc07f00b2c5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 17:39:44 +0100
Subject: btrfs: return a btrfs_bio from btrfs_bio_alloc

Return the containing struct btrfs_bio instead of the less type safe
struct bio from btrfs_bio_alloc.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c       | 12 +++++++-----
 fs/btrfs/bio.h       |  6 +++---
 fs/btrfs/extent_io.c | 18 +++++++++---------
 fs/btrfs/inode.c     | 18 +++++++++---------
 4 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index c04e103f8768..527081abca02 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -48,15 +48,17 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
  * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
  * a mempool.
  */
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-			    struct btrfs_inode *inode,
-			    btrfs_bio_end_io_t end_io, void *private)
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+				  struct btrfs_inode *inode,
+				  btrfs_bio_end_io_t end_io, void *private)
 {
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
-	btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
-	return bio;
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, inode, end_io, private);
+	return bbio;
 }
 
 static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index b4e7d5ab7d23..dbf125f6fa33 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -75,9 +75,9 @@ void __cold btrfs_bioset_exit(void);
 
 void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
 		    btrfs_bio_end_io_t end_io, void *private);
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-			    struct btrfs_inode *inode,
-			    btrfs_bio_end_io_t end_io, void *private);
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+				  struct btrfs_inode *inode,
+				  btrfs_bio_end_io_t end_io, void *private);
 
 static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c7d893104425..1221f699ffc5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -896,13 +896,13 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 			  u64 disk_bytenr, u64 file_offset)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct bio *bio;
+	struct btrfs_bio *bbio;
 
-	bio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode,
-			      bio_ctrl->end_io_func, NULL);
-	bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
-	btrfs_bio(bio)->file_offset = file_offset;
-	bio_ctrl->bbio = btrfs_bio(bio);
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode,
+			       bio_ctrl->end_io_func, NULL);
+	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+	bbio->file_offset = file_offset;
+	bio_ctrl->bbio = bbio;
 	bio_ctrl->len_to_oe_boundary = U32_MAX;
 
 	/*
@@ -911,7 +911,7 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 	 * them.
 	 */
 	if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE &&
-	    btrfs_use_zone_append(btrfs_bio(bio))) {
+	    btrfs_use_zone_append(bbio)) {
 		struct btrfs_ordered_extent *ordered;
 
 		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
@@ -930,8 +930,8 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 		 * to always be set on the last added/replaced device.
 		 * This is a bit odd but has been like that for a long time.
 		 */
-		bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
-		wbc_init_bio(bio_ctrl->wbc, bio);
+		bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
+		wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
 	}
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b5a82d22dbd1..76d93b9e94a9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9928,24 +9928,24 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 		.pending = ATOMIC_INIT(1),
 	};
 	unsigned long i = 0;
-	struct bio *bio;
+	struct btrfs_bio *bbio;
 
 	init_waitqueue_head(&priv.wait);
 
-	bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode,
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode,
 			      btrfs_encoded_read_endio, &priv);
-	bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 
 	do {
 		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
 
-		if (bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
 			atomic_inc(&priv.pending);
-			btrfs_submit_bio(btrfs_bio(bio), 0);
+			btrfs_submit_bio(bbio, 0);
 
-			bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode,
-					      btrfs_encoded_read_endio, &priv);
-			bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode,
+					       btrfs_encoded_read_endio, &priv);
+			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 			continue;
 		}
 
@@ -9955,7 +9955,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 	} while (disk_io_size);
 
 	atomic_inc(&priv.pending);
-	btrfs_submit_bio(btrfs_bio(bio), 0);
+	btrfs_submit_bio(bbio, 0);
 
 	if (atomic_dec_return(&priv.pending))
 		io_wait_event(priv.wait, !atomic_read(&priv.pending));
-- 
cgit v1.2.3


From 2cef0c79bb81d8bae1dbc45195771a824ca45e76 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 7 Mar 2023 17:39:45 +0100
Subject: btrfs: make btrfs_split_bio work on struct btrfs_bio

btrfs_split_bio expects a btrfs_bio as argument and always allocates one.
Type both the orig_bio argument and the return value as struct btrfs_bio
to improve type safety.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 527081abca02..cf09c6271edb 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -61,30 +61,31 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 	return bbio;
 }
 
-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
-				   struct bio *orig, u64 map_length,
-				   bool use_append)
+static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+					 struct btrfs_bio *orig_bbio,
+					 u64 map_length, bool use_append)
 {
-	struct btrfs_bio *orig_bbio = btrfs_bio(orig);
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	if (use_append) {
 		unsigned int nr_segs;
 
-		bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
 				   &btrfs_clone_bioset, map_length);
 	} else {
-		bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
-				&btrfs_clone_bioset);
+		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
+				GFP_NOFS, &btrfs_clone_bioset);
 	}
-	btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, orig_bbio->inode, NULL, orig_bbio);
 
-	btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
-	if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+	bbio->file_offset = orig_bbio->file_offset;
+	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
 		orig_bbio->file_offset += map_length;
 
 	atomic_inc(&orig_bbio->pending_ios);
-	return bio;
+	return bbio;
 }
 
 static void btrfs_orig_write_end_io(struct bio *bio);
@@ -633,8 +634,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		map_length = min(map_length, fs_info->max_zone_append_size);
 
 	if (map_length < length) {
-		bio = btrfs_split_bio(fs_info, bio, map_length, use_append);
-		bbio = btrfs_bio(bio);
+		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+		bio = &bbio->bio;
 	}
 
 	/*
-- 
cgit v1.2.3


From 078e4cf5dbedb3d407417a10ac5918dca6ca156a Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Thu, 30 Mar 2023 03:43:50 -0700
Subject: btrfs: use __bio_add_page for adding a single page in
 repair_one_sector

The btrfs repair bio submission code uses bio_add_page() to add a page
to a newly created bio. bio_add_page() can fail, but the return value is
never checked.

Use __bio_add_page() as adding a single page to a newly created bio is
guaranteed to succeed.

This brings us a step closer to marking bio_add_page() as __must_check.

Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index cf09c6271edb..89c1a0d7e89f 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -227,7 +227,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
 				      &btrfs_repair_bioset);
 	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
-	bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
 
 	repair_bbio = btrfs_bio(repair_bio);
 	btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
-- 
cgit v1.2.3


From 7edd339c8a416afed9d58b5d20778d5ee49e079f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Mar 2023 14:19:55 +0900
Subject: btrfs: pass an ordered_extent to btrfs_extract_ordered_extent

To prepare for a new caller that already has the ordered_extent
available, change btrfs_extract_ordered_extent to take an argument
for it.  Add a wrapper for the bio case that still has to do the
lookup (for now).

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c         | 16 +++++++++++++++-
 fs/btrfs/btrfs_inode.h |  3 ++-
 fs/btrfs/inode.c       | 26 ++++++++------------------
 3 files changed, 25 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 89c1a0d7e89f..afd2f90fdbff 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -61,6 +61,20 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 	return bbio;
 }
 
+static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
+{
+	struct btrfs_ordered_extent *ordered;
+	int ret;
+
+	ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+	if (WARN_ON_ONCE(!ordered))
+		return BLK_STS_IOERR;
+	ret = btrfs_extract_ordered_extent(bbio, ordered);
+	btrfs_put_ordered_extent(ordered);
+
+	return errno_to_blk_status(ret);
+}
+
 static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 					 struct btrfs_bio *orig_bbio,
 					 u64 map_length, bool use_append)
@@ -653,7 +667,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		if (use_append) {
 			bio->bi_opf &= ~REQ_OP_WRITE;
 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
-			ret = btrfs_extract_ordered_extent(bbio);
+			ret = btrfs_bio_extract_ordered_extent(bbio);
 			if (ret)
 				goto fail_put_bio;
 		}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9dc21622806e..bb4984480669 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -407,7 +407,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
 
 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 			    u32 pgoff, u8 *csum, const u8 * const csum_expected);
-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio);
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+				 struct btrfs_ordered_extent *ordered);
 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 			u32 bio_offset, struct bio_vec *bv);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 941fca1322b8..357d1626df1a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2601,37 +2601,27 @@ out_free_pre:
 	return ret;
 }
 
-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio)
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+				 struct btrfs_ordered_extent *ordered)
 {
 	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
 	u64 len = bbio->bio.bi_iter.bi_size;
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_ordered_extent *ordered;
-	u64 ordered_len;
+	u64 ordered_len = ordered->num_bytes;
 	int ret = 0;
 
-	ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset);
-	if (WARN_ON_ONCE(!ordered))
-		return BLK_STS_IOERR;
-	ordered_len = ordered->num_bytes;
-
 	/* Must always be called for the beginning of an ordered extent. */
-	if (WARN_ON_ONCE(start != ordered->disk_bytenr)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+		return -EINVAL;
 
 	/* No need to split if the ordered extent covers the entire bio. */
 	if (ordered->disk_num_bytes == len)
-		goto out;
+		return 0;
 
 	ret = btrfs_split_ordered_extent(ordered, len);
 	if (ret)
-		goto out;
-	ret = split_extent_map(inode, bbio->file_offset, ordered_len, len);
-out:
-	btrfs_put_ordered_extent(ordered);
-	return errno_to_blk_status(ret);
+		return ret;
+	return split_extent_map(inode, bbio->file_offset, ordered_len, len);
 }
 
 /*
-- 
cgit v1.2.3


From 3480373ebdf7625ee29bee6508c9fc4ae70c00bf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 27 Mar 2023 09:49:51 +0900
Subject: btrfs, block: move REQ_CGROUP_PUNT to btrfs

REQ_CGROUP_PUNT is a bit annoying as it is hard to follow and adds
a branch to the bio submission hot path.  To fix this, export
blkcg_punt_bio_submit and let btrfs call it directly.  Add a new
REQ_FS_PRIVATE flag for btrfs to indicate to it's own low-level
bio submission code that a punt to the cgroup submission helper
is required.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 block/blk-cgroup.c        | 31 +++++++++++++++++--------------
 block/blk-cgroup.h        | 12 ------------
 block/blk-core.c          |  3 ---
 fs/btrfs/bio.c            | 12 ++++++++----
 fs/btrfs/bio.h            |  3 +++
 fs/btrfs/extent_io.c      |  2 +-
 fs/btrfs/inode.c          |  2 +-
 include/linux/bio.h       |  5 +++++
 include/linux/blk_types.h | 18 +++++-------------
 9 files changed, 40 insertions(+), 48 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bd50b55bdb61..9f5f3263c178 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1688,24 +1688,27 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
-bool __blkcg_punt_bio_submit(struct bio *bio)
+/*
+ * When a shared kthread issues a bio for a cgroup, doing so synchronously can
+ * lead to priority inversions as the kthread can be trapped waiting for that
+ * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
+ * a dedicated per-blkcg work item to avoid such priority inversions.
+ */
+void blkcg_punt_bio_submit(struct bio *bio)
 {
 	struct blkcg_gq *blkg = bio->bi_blkg;
 
-	/* consume the flag first */
-	bio->bi_opf &= ~REQ_CGROUP_PUNT;
-
-	/* never bounce for the root cgroup */
-	if (!blkg->parent)
-		return false;
-
-	spin_lock_bh(&blkg->async_bio_lock);
-	bio_list_add(&blkg->async_bios, bio);
-	spin_unlock_bh(&blkg->async_bio_lock);
-
-	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
-	return true;
+	if (blkg->parent) {
+		spin_lock_bh(&blkg->async_bio_lock);
+		bio_list_add(&blkg->async_bios, bio);
+		spin_unlock_bh(&blkg->async_bio_lock);
+		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	} else {
+		/* never bounce for the root cgroup */
+		submit_bio(bio);
+	}
 }
+EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);
 
 /*
  * Scale the accumulated delay based on how long it has been since we updated
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9c5078755e5e..64758ab9f1f1 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -375,16 +375,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),	\
 					    (p_blkg)->q)))
 
-bool __blkcg_punt_bio_submit(struct bio *bio);
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio)
-{
-	if (bio->bi_opf & REQ_CGROUP_PUNT)
-		return __blkcg_punt_bio_submit(bio);
-	else
-		return false;
-}
-
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
@@ -506,8 +496,6 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return
 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline void blk_cgroup_bio_start(struct bio *bio) { }
 static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 42926e6cb83c..478978dcb2bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -830,9 +830,6 @@ EXPORT_SYMBOL(submit_bio_noacct);
  */
 void submit_bio(struct bio *bio)
 {
-	if (blkcg_punt_bio_submit(bio))
-		return;
-
 	if (bio_op(bio) == REQ_OP_READ) {
 		task_io_account_read(bio->bi_iter.bi_size);
 		count_vm_events(PGPGIN, bio_sectors(bio));
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index afd2f90fdbff..ed5aa8a176b9 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -435,7 +435,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
 		dev->devid, bio->bi_iter.bi_size);
 
 	btrfsic_check_bio(bio);
-	submit_bio(bio);
+
+	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
+		blkcg_punt_bio_submit(bio);
+	else
+		submit_bio(bio);
 }
 
 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
@@ -551,10 +555,10 @@ static void run_one_async_done(struct btrfs_work *work)
 
 	/*
 	 * All of the bios that pass through here are from async helpers.
-	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
-	 * This changes nothing when cgroups aren't in use.
+	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
+	 * context.  This changes nothing when cgroups aren't in use.
 	 */
-	bio->bi_opf |= REQ_CGROUP_PUNT;
+	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
 	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
 }
 
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index dbf125f6fa33..8edf3c35eead 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -88,6 +88,9 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 /* Bio only refers to one ordered extent. */
 #define REQ_BTRFS_ONE_ORDERED			REQ_DRV
 
+/* Submit using blkcg_punt_bio_submit. */
+#define REQ_BTRFS_CGROUP_PUNT			REQ_FS_PRIVATE
+
 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f5702b1e2b86..f40e4a002f78 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2538,7 +2538,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 	struct btrfs_bio_ctrl bio_ctrl = {
 		.wbc = &wbc_writepages,
 		/* We're called from an async helper function */
-		.opf = REQ_OP_WRITE | REQ_CGROUP_PUNT |
+		.opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT |
 			wbc_to_write_flags(&wbc_writepages),
 		.extent_locked = 1,
 	};
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c216cab2076..93e16a408f43 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1616,7 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
 		if (blkcg_css != blkcg_root_css) {
 			css_get(blkcg_css);
 			async_chunk[i].blkcg_css = blkcg_css;
-			async_chunk[i].write_flags |= REQ_CGROUP_PUNT;
+			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
 		} else {
 			async_chunk[i].blkcg_css = NULL;
 		}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d766be7152e1..b3e7529ff55e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -500,6 +500,7 @@ void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
 void bio_clone_blkg_association(struct bio *dst, struct bio *src);
+void blkcg_punt_bio_submit(struct bio *bio);
 #else	/* CONFIG_BLK_CGROUP */
 static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
@@ -507,6 +508,10 @@ static inline void bio_associate_blkg_from_css(struct bio *bio,
 { }
 static inline void bio_clone_blkg_association(struct bio *dst,
 					      struct bio *src) { }
+static inline void blkcg_punt_bio_submit(struct bio *bio)
+{
+	submit_bio(bio);
+}
 #endif	/* CONFIG_BLK_CGROUP */
 
 static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 99be590f952f..fb8843990d28 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -404,18 +404,11 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
-	/*
-	 * When a shared kthread needs to issue a bio for a cgroup, doing
-	 * so synchronously can lead to priority inversions as the kthread
-	 * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
-	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
-	 * work item to avoid such priority inversions.
-	 */
-	__REQ_CGROUP_PUNT,
 	__REQ_POLLED,		/* caller polls for completion using bio_poll */
 	__REQ_ALLOC_CACHE,	/* allocate IO from cache if available */
 	__REQ_SWAP,		/* swap I/O */
 	__REQ_DRV,		/* for driver use */
+	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 
 	/*
 	 * Command specific flags, keep last:
@@ -443,14 +436,13 @@ enum req_flag_bits {
 #define REQ_RAHEAD	(__force blk_opf_t)(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND	(__force blk_opf_t)(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT	(__force blk_opf_t)(1ULL << __REQ_NOWAIT)
-#define REQ_CGROUP_PUNT	(__force blk_opf_t)(1ULL << __REQ_CGROUP_PUNT)
-
-#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 #define REQ_POLLED	(__force blk_opf_t)(1ULL << __REQ_POLLED)
 #define REQ_ALLOC_CACHE	(__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE)
-
-#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_SWAP	(__force blk_opf_t)(1ULL << __REQ_SWAP)
+#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
+#define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
+
+#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
-- 
cgit v1.2.3


From 4317ff0056bedfc472202bf4ccf72d51094d6ade Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 23 Mar 2023 17:01:20 +0800
Subject: btrfs: introduce btrfs_bio::fs_info member

Currently we're doing a lot of work for btrfs_bio:

- Checksum verification for data read bios
- Bio splits if it crosses stripe boundary
- Read repair for data read bios

However for the incoming scrub patches, we don't want this extra
functionality at all, just plain logical + mirror -> physical mapping
ability.

Thus here we do the following changes:

- Introduce btrfs_bio::fs_info
  This is for the new scrub specific btrfs_bio, which would not populate
  btrfs_bio::inode.
  Thus we need such new member to grab a fs_info

  This new member will always be populated.

- Replace @inode argument with @fs_info for btrfs_bio_init() and its
  caller
  Since @inode is no longer a mandatory member, replace it with
  @fs_info, and let involved users populate @inode.

- Skip checksum verification and generation if @bbio->inode is NULL

- Add extra ASSERT()s
  To make sure:

  * bbio->inode is properly set for involved read repair path
  * if @file_offset is set, bbio->inode is also populated

- Grab @fs_info from @bbio directly
  We can no longer go @bbio->inode->root->fs_info, as bbio->inode can be
  NULL. This involves:

  * btrfs_simple_end_io()
  * should_async_write()
  * btrfs_wq_submit_bio()
  * btrfs_use_zone_append()

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c         | 42 +++++++++++++++++++++++++-----------------
 fs/btrfs/bio.h         | 12 +++++++++---
 fs/btrfs/compression.c |  3 ++-
 fs/btrfs/extent_io.c   |  3 ++-
 fs/btrfs/inode.c       | 13 +++++++++----
 fs/btrfs/zoned.c       |  4 ++--
 6 files changed, 49 insertions(+), 28 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index ed5aa8a176b9..e40d1ababa08 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -31,11 +31,11 @@ struct btrfs_failed_bio {
  * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
  * is already initialized by the block layer.
  */
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 		    btrfs_bio_end_io_t end_io, void *private)
 {
 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-	bbio->inode = inode;
+	bbio->fs_info = fs_info;
 	bbio->end_io = end_io;
 	bbio->private = private;
 	atomic_set(&bbio->pending_ios, 1);
@@ -49,7 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
  * a mempool.
  */
 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-				  struct btrfs_inode *inode,
+				  struct btrfs_fs_info *fs_info,
 				  btrfs_bio_end_io_t end_io, void *private)
 {
 	struct btrfs_bio *bbio;
@@ -57,7 +57,7 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
 
 	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
 	bbio = btrfs_bio(bio);
-	btrfs_bio_init(bbio, inode, end_io, private);
+	btrfs_bio_init(bbio, fs_info, end_io, private);
 	return bbio;
 }
 
@@ -92,8 +92,8 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
 				GFP_NOFS, &btrfs_clone_bioset);
 	}
 	bbio = btrfs_bio(bio);
-	btrfs_bio_init(bbio, orig_bbio->inode, NULL, orig_bbio);
-
+	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
+	bbio->inode = orig_bbio->inode;
 	bbio->file_offset = orig_bbio->file_offset;
 	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
 		orig_bbio->file_offset += map_length;
@@ -244,7 +244,8 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
 
 	repair_bbio = btrfs_bio(repair_bio);
-	btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
+	repair_bbio->inode = failed_bbio->inode;
 	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
 
 	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
@@ -263,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
 	struct btrfs_failed_bio *fbio = NULL;
 	u32 offset = 0;
 
+	/* Read-repair requires the inode field to be set by the submitter. */
+	ASSERT(inode);
+
 	/*
 	 * Hand off repair bios to the repair code as there is no upper level
 	 * submitter for them.
@@ -323,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
 
 	/* Metadata reads are checked and repaired by the submitter. */
-	if (bbio->bio.bi_opf & REQ_META)
-		bbio->end_io(bbio);
-	else
+	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+	else
+		bbio->end_io(bbio);
 }
 
 static void btrfs_simple_end_io(struct bio *bio)
 {
 	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_device *dev = bio->bi_private;
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 	btrfs_bio_counter_dec(fs_info);
 
@@ -357,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio)
 
 	btrfs_bio_counter_dec(bioc->fs_info);
 	bbio->mirror_num = bioc->mirror_num;
-	if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+	if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
+	    !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, NULL);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -583,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
 	 * in order.
 	 */
 	if (bbio->bio.bi_opf & REQ_META) {
-		struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+		struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 		if (btrfs_is_zoned(fs_info))
 			return false;
@@ -603,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 				struct btrfs_io_context *bioc,
 				struct btrfs_io_stripe *smap, int mirror_num)
 {
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct async_submit_bio *async;
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -627,7 +632,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 {
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
 	struct bio *bio = &bbio->bio;
 	u64 logical = bio->bi_iter.bi_sector << 9;
@@ -660,7 +665,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	 * Save the iter for the end_io handler and preload the checksums for
 	 * data reads.
 	 */
-	if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
+	if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
 		bbio->saved_iter = bio->bi_iter;
 		ret = btrfs_lookup_bio_sums(bbio);
 		if (ret)
@@ -680,7 +685,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		 * Csum items for reloc roots have already been cloned at this
 		 * point, so they are handled as part of the no-checksum case.
 		 */
-		if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
+		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
 		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
 		    !btrfs_is_data_reloc_root(inode->root)) {
 			if (should_async_write(bbio) &&
@@ -709,6 +714,9 @@ fail:
 
 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
 {
+	/* If bbio->inode is not populated, its file_offset must be 0. */
+	ASSERT(bbio->inode || bbio->file_offset == 0);
+
 	while (!btrfs_submit_chunk(bbio, mirror_num))
 		;
 }
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index 8edf3c35eead..51b4f3d93f04 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
  * passed to btrfs_submit_bio for mapping to the physical devices.
  */
 struct btrfs_bio {
-	/* Inode and offset into it that this I/O operates on. */
+	/*
+	 * Inode and offset into it that this I/O operates on.
+	 * Only set for data I/O.
+	 */
 	struct btrfs_inode *inode;
 	u64 file_offset;
 
@@ -58,6 +61,9 @@ struct btrfs_bio {
 	atomic_t pending_ios;
 	struct work_struct end_io_work;
 
+	/* File system that this I/O operates on. */
+	struct btrfs_fs_info *fs_info;
+
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.
@@ -73,10 +79,10 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
 int __init btrfs_bioset_init(void);
 void __cold btrfs_bioset_exit(void);
 
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 		    btrfs_bio_end_io_t end_io, void *private);
 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-				  struct btrfs_inode *inode,
+				  struct btrfs_fs_info *fs_info,
 				  btrfs_bio_end_io_t end_io, void *private);
 
 static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d532a8c8c9d8..2d0493f0a184 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -69,7 +69,8 @@ static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
 
 	bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
 					  GFP_NOFS, &btrfs_compressed_bioset));
-	btrfs_bio_init(bbio, inode, end_io, NULL);
+	btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
+	bbio->inode = inode;
 	bbio->file_offset = start;
 	return to_compressed_bio(bbio);
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f40e4a002f78..a1adadd5d25d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -898,9 +898,10 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_bio *bbio;
 
-	bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode,
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
 			       bio_ctrl->end_io_func, NULL);
 	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+	bbio->inode = inode;
 	bbio->file_offset = file_offset;
 	bio_ctrl->bbio = bbio;
 	bio_ctrl->len_to_oe_boundary = U32_MAX;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 93e16a408f43..57d070025c7a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7711,7 +7711,9 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
 		container_of(bbio, struct btrfs_dio_private, bbio);
 	struct btrfs_dio_data *dio_data = iter->private;
 
-	btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private);
+	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+		       btrfs_dio_end_io, bio->bi_private);
+	bbio->inode = BTRFS_I(iter->inode);
 	bbio->file_offset = file_offset;
 
 	dip->file_offset = file_offset;
@@ -9899,6 +9901,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 					  u64 file_offset, u64 disk_bytenr,
 					  u64 disk_io_size, struct page **pages)
 {
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_encoded_read_private priv = {
 		.pending = ATOMIC_INIT(1),
 	};
@@ -9907,9 +9910,10 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 
 	init_waitqueue_head(&priv.wait);
 
-	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode,
-			      btrfs_encoded_read_endio, &priv);
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+			       btrfs_encoded_read_endio, &priv);
 	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+	bbio->inode = inode;
 
 	do {
 		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
@@ -9918,9 +9922,10 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 			atomic_inc(&priv.pending);
 			btrfs_submit_bio(bbio, 0);
 
-			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode,
+			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 					       btrfs_encoded_read_endio, &priv);
 			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+			bbio->inode = inode;
 			continue;
 		}
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 45d04092f2f8..a9b32ba6b2ce 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1640,14 +1640,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 {
 	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_block_group *cache;
 	bool ret = false;
 
 	if (!btrfs_is_zoned(fs_info))
 		return false;
 
-	if (!is_data_inode(&inode->vfs_inode))
+	if (!inode || !is_data_inode(&inode->vfs_inode))
 		return false;
 
 	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
-- 
cgit v1.2.3


From 4886ff7b50f6341539611a1503a6ad2e55b77c98 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 20 Mar 2023 10:12:49 +0800
Subject: btrfs: introduce a new helper to submit write bio for repair

Both scrub and read-repair are utilizing a special repair writes that:

- Only writes back to a single device
  Even for read-repair on RAID56, we only update the corrupted data
  stripe itself, not triggering the full RMW path.

- Requires a valid @mirror_num
  For RAID56 case, only @mirror_num == 1 is valid.
  For non-RAID56 cases, we need @mirror_num to locate our stripe.

- No data csum generation needed

These two call sites still have some differences though:

- Read-repair goes plain bio
  It doesn't need a full btrfs_bio, and goes submit_bio_wait().

- New scrub repair would go btrfs_bio
  To simplify both read and write path.

So here this patch would:

- Introduce a common helper, btrfs_map_repair_block()
  Due to the single device nature, we can use an on-stack
  btrfs_io_stripe to pass device and its physical bytenr.

- Introduce a new interface, btrfs_submit_repair_bio(), for later scrub
  code
  This is for the incoming scrub code.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c     | 94 +++++++++++++++++++++++++++++-------------------------
 fs/btrfs/bio.h     |  1 +
 fs/btrfs/raid56.h  |  5 +++
 fs/btrfs/volumes.c | 73 ++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  3 ++
 5 files changed, 132 insertions(+), 44 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index e40d1ababa08..5379c4714905 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -735,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
 			    unsigned int pg_offset, int mirror_num)
 {
-	struct btrfs_device *dev;
+	struct btrfs_io_stripe smap = { 0 };
 	struct bio_vec bvec;
 	struct bio bio;
-	u64 map_length = 0;
-	u64 sector;
-	struct btrfs_io_context *bioc = NULL;
 	int ret = 0;
 
 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -749,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	if (btrfs_repair_one_zone(fs_info, logical))
 		return 0;
 
-	map_length = length;
-
 	/*
 	 * Avoid races with device replace and make sure our bioc has devices
 	 * associated to its stripes that don't go away while we are doing the
 	 * read repair operation.
 	 */
 	btrfs_bio_counter_inc_blocked(fs_info);
-	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
-		/*
-		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
-		 * to update all raid stripes, but here we just want to correct
-		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
-		 * stripe's dev and sector.
-		 */
-		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
-				      &map_length, &bioc, 0);
-		if (ret)
-			goto out_counter_dec;
-		ASSERT(bioc->mirror_num == 1);
-	} else {
-		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
-				      &map_length, &bioc, mirror_num);
-		if (ret)
-			goto out_counter_dec;
-		/*
-		 * This happens when dev-replace is also running, and the
-		 * mirror_num indicates the dev-replace target.
-		 *
-		 * In this case, we don't need to do anything, as the read
-		 * error just means the replace progress hasn't reached our
-		 * read range, and later replace routine would handle it well.
-		 */
-		if (mirror_num != bioc->mirror_num)
-			goto out_counter_dec;
-	}
-
-	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
-	dev = bioc->stripes[bioc->mirror_num - 1].dev;
-	btrfs_put_bioc(bioc);
+	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+	if (ret < 0)
+		goto out_counter_dec;
 
-	if (!dev || !dev->bdev ||
-	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+	if (!smap.dev->bdev ||
+	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
 		ret = -EIO;
 		goto out_counter_dec;
 	}
 
-	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
-	bio.bi_iter.bi_sector = sector;
+	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
 	__bio_add_page(&bio, page, length, pg_offset);
 
 	btrfsic_check_bio(&bio);
 	ret = submit_bio_wait(&bio);
 	if (ret) {
 		/* try to remap that extent elsewhere? */
-		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
 		goto out_bio_uninit;
 	}
 
 	btrfs_info_rl_in_rcu(fs_info,
 		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
-			     ino, start, btrfs_dev_name(dev), sector);
+			     ino, start, btrfs_dev_name(smap.dev),
+			     smap.physical >> SECTOR_SHIFT);
 	ret = 0;
 
 out_bio_uninit:
@@ -820,6 +787,45 @@ out_counter_dec:
 	return ret;
 }
 
+/*
+ * Submit a btrfs_bio based repair write.
+ *
+ * If @dev_replace is true, the write would be submitted to dev-replace target.
+ */
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
+{
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
+	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+	u64 length = bbio->bio.bi_iter.bi_size;
+	struct btrfs_io_stripe smap = { 0 };
+	int ret;
+
+	ASSERT(fs_info);
+	ASSERT(mirror_num > 0);
+	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+	ASSERT(!bbio->inode);
+
+	btrfs_bio_counter_inc_blocked(fs_info);
+	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+	if (ret < 0)
+		goto fail;
+
+	if (dev_replace) {
+		if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
+			bbio->bio.bi_opf &= ~REQ_OP_WRITE;
+			bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+		}
+		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
+		smap.dev = fs_info->dev_replace.tgtdev;
+	}
+	__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+	return;
+
+fail:
+	btrfs_bio_counter_dec(fs_info);
+	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+}
+
 int __init btrfs_bioset_init(void)
 {
 	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index 51b4f3d93f04..a8eca3a65673 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -98,6 +98,7 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 #define REQ_BTRFS_CGROUP_PUNT			REQ_FS_PRIVATE
 
 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
 			    unsigned int pg_offset, int mirror_num);
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index df0e0abdeb1f..6583c225b1bd 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map)
 	return map->num_stripes - btrfs_nr_parity_stripes(map->type);
 }
 
+static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
+{
+	return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+}
+
 #define RAID5_P_STRIPE ((u64)-2)
 #define RAID6_Q_STRIPE ((u64)-1)
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c201d72f798e..db6e15205be5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -8019,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
 
 	return true;
 }
+
+static void map_raid56_repair_block(struct btrfs_io_context *bioc,
+				    struct btrfs_io_stripe *smap,
+				    u64 logical)
+{
+	int data_stripes = nr_bioc_data_stripes(bioc);
+	int i;
+
+	for (i = 0; i < data_stripes; i++) {
+		u64 stripe_start = bioc->full_stripe_logical +
+				   (i << BTRFS_STRIPE_LEN_SHIFT);
+
+		if (logical >= stripe_start &&
+		    logical < stripe_start + BTRFS_STRIPE_LEN)
+			break;
+	}
+	ASSERT(i < data_stripes);
+	smap->dev = bioc->stripes[i].dev;
+	smap->physical = bioc->stripes[i].physical +
+			((logical - bioc->full_stripe_logical) &
+			 BTRFS_STRIPE_LEN_MASK);
+}
+
+/*
+ * Map a repair write into a single device.
+ *
+ * A repair write is triggered by read time repair or scrub, which would only
+ * update the contents of a single device.
+ * Not update any other mirrors nor go through RMW path.
+ *
+ * Callers should ensure:
+ *
+ * - Call btrfs_bio_counter_inc_blocked() first
+ * - The range does not cross stripe boundary
+ * - Has a valid @mirror_num passed in.
+ */
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+			   struct btrfs_io_stripe *smap, u64 logical,
+			   u32 length, int mirror_num)
+{
+	struct btrfs_io_context *bioc = NULL;
+	u64 map_length = length;
+	int mirror_ret = mirror_num;
+	int ret;
+
+	ASSERT(mirror_num > 0);
+
+	ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
+				&bioc, smap, &mirror_ret, true);
+	if (ret < 0)
+		return ret;
+
+	/* The map range should not cross stripe boundary. */
+	ASSERT(map_length >= length);
+
+	/* Already mapped to single stripe. */
+	if (!bioc)
+		goto out;
+
+	/* Map the RAID56 multi-stripe writes to a single one. */
+	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		map_raid56_repair_block(bioc, smap, logical);
+		goto out;
+	}
+
+	ASSERT(mirror_num <= bioc->num_stripes);
+	smap->dev = bioc->stripes[mirror_num - 1].dev;
+	smap->physical = bioc->stripes[mirror_num - 1].physical;
+out:
+	btrfs_put_bioc(bioc);
+	ASSERT(smap->dev);
+	return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 650e131d079e..bf47a1a70813 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -587,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		      struct btrfs_io_context **bioc_ret,
 		      struct btrfs_io_stripe *smap, int *mirror_num_ret,
 		      int need_raid_map);
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+			   struct btrfs_io_stripe *smap, u64 logical,
+			   u32 length, int mirror_num);
 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 					       u64 logical, u64 *length_ret,
 					       u32 *num_stripes);
-- 
cgit v1.2.3


From 45c2f36871955b51b4ce083c447388d8c72d6b91 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 15 May 2023 11:18:21 +0200
Subject: btrfs: call btrfs_orig_bbio_end_io in btrfs_end_bio_work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When I implemented the storage layer bio splitting, I was under the
assumption that we'll never split metadata bios.  But Qu reminded me that
this can actually happen with very old file systems with unaligned
metadata chunks and RAID0.

I still haven't seen such a case in practice, but we better handled this
case, especially as it is fairly easily to do not calling the ->end_іo
method directly in btrfs_end_io_work, and using the proper
btrfs_orig_bbio_end_io helper instead.

In addition to the old file system with unaligned metadata chunks case
documented in the commit log, the combination of the new scrub code
with Johannes pending raid-stripe-tree also triggers this case.  We
spent some time debugging it and found that this patch solves
the problem.

Fixes: 103c19723c80 ("btrfs: split the bio submission path into a separate file")
CC: stable@vger.kernel.org # 6.3+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 5379c4714905..35d34c731d72 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -330,7 +330,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
 	else
-		bbio->end_io(bbio);
+		btrfs_orig_bbio_end_io(bbio);
 }
 
 static void btrfs_simple_end_io(struct bio *bio)
-- 
cgit v1.2.3


From b675df0257bb717082f592626da3ddfc5bdc2b6b Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 1 Jun 2023 18:51:34 +0800
Subject: btrfs: zoned: fix dev-replace after the scrub rework

[BUG]
After commit e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror()
to scrub_stripe infrastructure"), scrub no longer works for zoned device
at all.

Even an empty zoned btrfs cannot be replaced:

  # mkfs.btrfs -f /dev/nvme0n1
  # mount /dev/nvme0n1 /mnt/btrfs
  # btrfs replace start -Bf 1 /dev/nvme0n2 /mnt/btrfs
  Resetting device zones /dev/nvme1n1 (160 zones) ...
  ERROR: ioctl(DEV_REPLACE_START) failed on "/mnt/btrfs/": Input/output error

And we can hit kernel crash related to that:

  BTRFS info (device nvme1n1): host-managed zoned block device /dev/nvme3n1, 160 zones of 134217728 bytes
  BTRFS info (device nvme1n1): dev_replace from /dev/nvme2n1 (devid 2) to /dev/nvme3n1 started
  nvme3n1: Zone Management Append(0x7d) @ LBA 65536, 4 blocks, Zone Is Full (sct 0x1 / sc 0xb9) DNR
  I/O error, dev nvme3n1, sector 786432 op 0xd:(ZONE_APPEND) flags 0x4000 phys_seg 3 prio class 2
  BTRFS error (device nvme1n1): bdev /dev/nvme3n1 errs: wr 1, rd 0, flush 0, corrupt 0, gen 0
  BUG: kernel NULL pointer dereference, address: 00000000000000a8
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
  RIP: 0010:_raw_spin_lock_irqsave+0x1e/0x40
  Call Trace:
   <IRQ>
   btrfs_lookup_ordered_extent+0x31/0x190
   btrfs_record_physical_zoned+0x18/0x40
   btrfs_simple_end_io+0xaf/0xc0
   blk_update_request+0x153/0x4c0
   blk_mq_end_request+0x15/0xd0
   nvme_poll_cq+0x1d3/0x360
   nvme_irq+0x39/0x80
   __handle_irq_event_percpu+0x3b/0x190
   handle_irq_event+0x2f/0x70
   handle_edge_irq+0x7c/0x210
   __common_interrupt+0x34/0xa0
   common_interrupt+0x7d/0xa0
   </IRQ>
   <TASK>
   asm_common_interrupt+0x22/0x40

[CAUSE]
Dev-replace reuses scrub code to iterate all extents and write the
existing content back to the new device.

And for zoned devices, we call fill_writer_pointer_gap() to make sure
all the writes into the zoned device is sequential, even if there may be
some gaps between the writes.

However we have several different bugs all related to zoned dev-replace:

- We are using ZONE_APPEND operation for metadata style write back
  For zoned devices, btrfs has two ways to write data:

  * ZONE_APPEND for data
    This allows higher queue depth, but will not be able to know where
    the write would land.
    Thus needs to grab the real on-disk physical location in it's endio.

  * WRITE for metadata
    This requires single queue depth (new writes can only be submitted
    after previous one finished), and all writes must be sequential.

  For scrub, we go single queue depth, but still goes with ZONE_APPEND,
  which requires btrfs_bio::inode being populated.
  This is the cause of that crash.

- No correct tracing of write_pointer
  After a write finished, we should forward sctx->write_pointer, or
  fill_writer_pointer_gap() would not work properly and cause more
  than necessary zero out, and fill the whole zone prematurely.

- Incorrect physical bytenr passed to fill_writer_pointer_gap()
  In scrub_write_sectors(), one call site passes logical address, which
  is completely wrong.

  The other call site passes physical address of current sector, but
  we should pass the physical address of the btrfs_bio we're submitting.

  This is the cause of the -EIO errors.

[FIX]
- Do not use ZONE_APPEND for btrfs_submit_repair_write().

- Manually forward sctx->write_pointer after successful writeback

- Use the physical address of the to-be-submitted btrfs_bio for
  fill_writer_pointer_gap()

Now zoned device replace would work as expected.

Reported-by: Christoph Hellwig <hch@lst.de>
Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/bio.c   |  4 ----
 fs/btrfs/scrub.c | 48 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 32 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/bio.c')

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 35d34c731d72..b3ad0f51e616 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -811,10 +811,6 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_
 		goto fail;
 
 	if (dev_replace) {
-		if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
-			bbio->bio.bi_opf &= ~REQ_OP_WRITE;
-			bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
-		}
 		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
 		smap.dev = fs_info->dev_replace.tgtdev;
 	}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index dd37cba58022..7c666517d3d3 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1137,6 +1137,35 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
 		wake_up(&stripe->io_wait);
 }
 
+static void scrub_submit_write_bio(struct scrub_ctx *sctx,
+				   struct scrub_stripe *stripe,
+				   struct btrfs_bio *bbio, bool dev_replace)
+{
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	u32 bio_len = bbio->bio.bi_iter.bi_size;
+	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
+		      stripe->logical;
+
+	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
+	atomic_inc(&stripe->pending_io);
+	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+	if (!btrfs_is_zoned(fs_info))
+		return;
+	/*
+	 * For zoned writeback, queue depth must be 1, thus we must wait for
+	 * the write to finish before the next write.
+	 */
+	wait_scrub_stripe_io(stripe);
+
+	/*
+	 * And also need to update the write pointer if write finished
+	 * successfully.
+	 */
+	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
+		      &stripe->write_error_bitmap))
+		sctx->write_pointer += bio_len;
+}
+
 /*
  * Submit the write bio(s) for the sectors specified by @write_bitmap.
  *
@@ -1155,7 +1184,6 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 {
 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
 	struct btrfs_bio *bbio = NULL;
-	const bool zoned = btrfs_is_zoned(fs_info);
 	int sector_nr;
 
 	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
@@ -1168,13 +1196,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 
 		/* Cannot merge with previous sector, submit the current one. */
 		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
-			fill_writer_pointer_gap(sctx, stripe->physical +
-					(sector_nr << fs_info->sectorsize_bits));
-			atomic_inc(&stripe->pending_io);
-			btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
-			/* For zoned writeback, queue depth must be 1. */
-			if (zoned)
-				wait_scrub_stripe_io(stripe);
+			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
 			bbio = NULL;
 		}
 		if (!bbio) {
@@ -1187,14 +1209,8 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
 		ASSERT(ret == fs_info->sectorsize);
 	}
-	if (bbio) {
-		fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector <<
-					SECTOR_SHIFT);
-		atomic_inc(&stripe->pending_io);
-		btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
-		if (zoned)
-			wait_scrub_stripe_io(stripe);
-	}
+	if (bbio)
+		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
 }
 
 /*
-- 
cgit v1.2.3