summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/xfs/Makefile3
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c23
-rw-r--r--fs/xfs/libxfs/xfs_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_defer.h1
-rw-r--r--fs/xfs/libxfs/xfs_format.h25
-rw-r--r--fs/xfs/libxfs/xfs_fs.h7
-rw-r--r--fs/xfs/libxfs/xfs_health.h4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c33
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c6
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h6
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c276
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h23
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c7
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.c19
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h11
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.c757
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.h189
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.c28
-rw-r--r--fs/xfs/libxfs/xfs_sb.c8
-rw-r--r--fs/xfs/libxfs/xfs_shared.h7
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c25
-rw-r--r--fs/xfs/scrub/agheader_repair.c2
-rw-r--r--fs/xfs/scrub/bmap.c30
-rw-r--r--fs/xfs/scrub/bmap_repair.c21
-rw-r--r--fs/xfs/scrub/common.c10
-rw-r--r--fs/xfs/scrub/common.h5
-rw-r--r--fs/xfs/scrub/cow_repair.c180
-rw-r--r--fs/xfs/scrub/health.c1
-rw-r--r--fs/xfs/scrub/inode.c31
-rw-r--r--fs/xfs/scrub/inode_repair.c57
-rw-r--r--fs/xfs/scrub/metapath.c3
-rw-r--r--fs/xfs/scrub/quota.c8
-rw-r--r--fs/xfs/scrub/quota_repair.c2
-rw-r--r--fs/xfs/scrub/reap.c247
-rw-r--r--fs/xfs/scrub/reap.h7
-rw-r--r--fs/xfs/scrub/refcount.c2
-rw-r--r--fs/xfs/scrub/refcount_repair.c6
-rw-r--r--fs/xfs/scrub/repair.c6
-rw-r--r--fs/xfs/scrub/repair.h7
-rw-r--r--fs/xfs/scrub/rgb_bitmap.h37
-rw-r--r--fs/xfs/scrub/rmap_repair.c7
-rw-r--r--fs/xfs/scrub/rtb_bitmap.h37
-rw-r--r--fs/xfs/scrub/rtbitmap.c2
-rw-r--r--fs/xfs/scrub/rtbitmap_repair.c24
-rw-r--r--fs/xfs/scrub/rtrefcount.c661
-rw-r--r--fs/xfs/scrub/rtrefcount_repair.c783
-rw-r--r--fs/xfs/scrub/rtrmap.c54
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c103
-rw-r--r--fs/xfs/scrub/scrub.c7
-rw-r--r--fs/xfs/scrub/scrub.h12
-rw-r--r--fs/xfs/scrub/stats.c1
-rw-r--r--fs/xfs/scrub/trace.h54
-rw-r--r--fs/xfs/xfs_buf_item_recover.c4
-rw-r--r--fs/xfs/xfs_fsmap.c25
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_health.c1
-rw-r--r--fs/xfs/xfs_inode_item.c14
-rw-r--r--fs/xfs/xfs_inode_item_recover.c4
-rw-r--r--fs/xfs/xfs_ioctl.c21
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c5
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_refcount_item.c240
-rw-r--r--fs/xfs/xfs_reflink.c321
-rw-r--r--fs/xfs/xfs_reflink.h4
-rw-r--r--fs/xfs/xfs_rtalloc.c24
-rw-r--r--fs/xfs/xfs_rtalloc.h5
-rw-r--r--fs/xfs/xfs_stats.c3
-rw-r--r--fs/xfs/xfs_stats.h1
-rw-r--r--fs/xfs/xfs_super.c6
-rw-r--r--fs/xfs/xfs_trace.h111
74 files changed, 4328 insertions, 350 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 338e10f81b7b..7afa51e41427 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -51,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_rmap_btree.o \
xfs_refcount.o \
xfs_refcount_btree.o \
+ xfs_rtrefcount_btree.o \
xfs_rtrmap_btree.o \
xfs_sb.o \
xfs_symlink_remote.o \
@@ -194,6 +195,7 @@ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rgsuper.o \
rtbitmap.o \
+ rtrefcount.o \
rtrmap.o \
rtsummary.o \
)
@@ -234,6 +236,7 @@ xfs-y += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rtbitmap_repair.o \
+ rtrefcount_repair.o \
rtrmap_repair.o \
rtsummary_repair.o \
)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 02323936cc9b..40ad22fb808b 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4564,8 +4564,9 @@ xfs_bmapi_write(
* the refcount btree for orphan recovery.
*/
if (whichfork == XFS_COW_FORK)
- xfs_refcount_alloc_cow_extent(tp, bma.blkno,
- bma.length);
+ xfs_refcount_alloc_cow_extent(tp,
+ XFS_IS_REALTIME_INODE(ip),
+ bma.blkno, bma.length);
}
/* Deal with the allocated space we found. */
@@ -4740,7 +4741,8 @@ xfs_bmapi_convert_one_delalloc(
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
- xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+ xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip),
+ bma.blkno, bma.length);
error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
whichfork);
@@ -5388,7 +5390,7 @@ xfs_bmap_del_extent_real(
bool isrt = xfs_ifork_is_realtime(ip, whichfork);
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
- xfs_refcount_decrease_extent(tp, del);
+ xfs_refcount_decrease_extent(tp, isrt, del);
} else if (isrt && !xfs_has_rtgroups(mp)) {
error = xfs_bmap_free_rtblocks(tp, del);
} else {
@@ -6498,9 +6500,8 @@ xfs_get_extsz_hint(
* No point in aligning allocations if we need to COW to actually
* write to them.
*/
- if (xfs_is_always_cow_inode(ip))
- return 0;
- if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+ if (!xfs_is_always_cow_inode(ip) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
return ip->i_extsize;
if (XFS_IS_REALTIME_INODE(ip) &&
ip->i_mount->m_sb.sb_rextsize > 1)
@@ -6523,7 +6524,13 @@ xfs_get_cowextsz_hint(
a = 0;
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
a = ip->i_cowextsize;
- b = xfs_get_extsz_hint(ip);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ b = 0;
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+ b = ip->i_extsize;
+ } else {
+ b = xfs_get_extsz_hint(ip);
+ }
a = max(a, b);
if (a == 0)
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 36ab06f8a3bc..299ce7fd11b0 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -35,6 +35,7 @@
#include "xfs_rmap.h"
#include "xfs_quota.h"
#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Btree magic numbers.
@@ -5535,6 +5536,9 @@ xfs_btree_init_cur_caches(void)
error = xfs_rtrmapbt_init_cur_cache();
if (error)
goto err;
+ error = xfs_rtrefcountbt_init_cur_cache();
+ if (error)
+ goto err;
return 0;
err:
@@ -5552,6 +5556,7 @@ xfs_btree_destroy_cur_caches(void)
xfs_rmapbt_destroy_cur_cache();
xfs_refcountbt_destroy_cur_cache();
xfs_rtrmapbt_destroy_cur_cache();
+ xfs_rtrefcountbt_destroy_cur_cache();
}
/* Move the btree cursor before the first record. */
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index dbc047b2fb2c..355b304696e6 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -297,7 +297,7 @@ struct xfs_btree_cur
struct {
unsigned int nr_ops; /* # record updates */
unsigned int shape_changes; /* # of extent splits */
- } bc_refc; /* refcountbt */
+ } bc_refc; /* refcountbt/rtrefcountbt */
};
/* Must be at the end of the struct! */
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 1e2477eaa5a8..9effd95ddcd4 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -68,6 +68,7 @@ struct xfs_defer_op_type {
extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index fba4e59aded4..b1007fb661ba 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -858,6 +858,7 @@ enum xfs_metafile_type {
XFS_METAFILE_RTBITMAP, /* rt bitmap */
XFS_METAFILE_RTSUMMARY, /* rt summary */
XFS_METAFILE_RTRMAP, /* rt rmap */
+ XFS_METAFILE_RTREFCOUNT, /* rt refcount */
XFS_METAFILE_MAX
} __packed;
@@ -870,7 +871,8 @@ enum xfs_metafile_type {
{ XFS_METAFILE_PRJQUOTA, "prjquota" }, \
{ XFS_METAFILE_RTBITMAP, "rtbitmap" }, \
{ XFS_METAFILE_RTSUMMARY, "rtsummary" }, \
- { XFS_METAFILE_RTRMAP, "rtrmap" }
+ { XFS_METAFILE_RTRMAP, "rtrmap" }, \
+ { XFS_METAFILE_RTREFCOUNT, "rtrefcount" }
/*
* On-disk inode structure.
@@ -1790,12 +1792,29 @@ struct xfs_refcount_key {
__be32 rc_startblock; /* starting block number */
};
-#define MAXREFCOUNT ((xfs_nlink_t)~0U)
-#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
+#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U)
+#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U)
/* btree pointer type */
typedef __be32 xfs_refcount_ptr_t;
+/*
+ * Realtime Reference Count btree format definitions
+ *
+ * This is a btree for reference count records for realtime volumes
+ */
+#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */
+
+/*
+ * rt refcount root header, on-disk form only.
+ */
+struct xfs_rtrefcount_root {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+};
+
+/* inode-rooted btree pointer type */
+typedef __be64 xfs_rtrefcount_ptr_t;
/*
* BMAP Btree format definitions
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index d42d3a5617e3..2c3171262b44 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -738,9 +738,10 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */
#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */
#define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */
+#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 32
+#define XFS_SCRUB_TYPE_NR 33
/*
* This special type code only applies to the vectored scrub implementation.
@@ -831,9 +832,10 @@ struct xfs_scrub_vec_head {
#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */
#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */
#define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */
+#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */
/* Number of metapath sm_ino values */
-#define XFS_SCRUB_METAPATH_NR (9)
+#define XFS_SCRUB_METAPATH_NR (10)
/*
* ioctl limits
@@ -996,6 +998,7 @@ struct xfs_rtgroup_geometry {
#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */
#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */
#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */
+#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */
/*
* ioctl commands that are used by Linux filesystems
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 5c8a0aff6ba6..b31000f7190c 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -71,6 +71,7 @@ struct xfs_rtgroup;
#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */
#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */
#define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */
+#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */
/* Observable health issues for AG metadata. */
#define XFS_SICK_AG_SB (1 << 0) /* superblock */
@@ -117,7 +118,8 @@ struct xfs_rtgroup;
#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \
XFS_SICK_RG_BITMAP | \
XFS_SICK_RG_SUMMARY | \
- XFS_SICK_RG_RMAPBT)
+ XFS_SICK_RG_RMAPBT | \
+ XFS_SICK_RG_REFCNTBT)
#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
XFS_SICK_AG_AGF | \
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 17cb91b89fca..f24fa628fecf 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -456,6 +456,11 @@ xfs_dinode_verify_fork(
if (!xfs_has_rmapbt(mp))
return __this_address;
break;
+ case XFS_METAFILE_RTREFCOUNT:
+ /* same comment about growfs and rmap inodes applies */
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ break;
default:
return __this_address;
}
@@ -743,7 +748,8 @@ xfs_dinode_verify(
return __this_address;
/* don't let reflink and realtime mix */
- if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) &&
+ !xfs_has_rtreflink(mp))
return __this_address;
/* COW extent size hint validation */
@@ -904,11 +910,29 @@ xfs_inode_validate_cowextsize(
bool rt_flag;
bool hint_flag;
uint32_t cowextsize_bytes;
+ uint32_t blocksize_bytes;
rt_flag = (flags & XFS_DIFLAG_REALTIME);
hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
+ /*
+ * Similar to extent size hints, a directory can be configured to
+ * propagate realtime status and a CoW extent size hint to newly
+ * created files even if there is no realtime device, and the hints on
+ * disk can become misaligned if the sysadmin changes the rt extent
+ * size while adding the realtime device.
+ *
+ * Therefore, we can only enforce the rextsize alignment check against
+ * regular realtime files, and rely on callers to decide when alignment
+ * checks are appropriate, and fix things up as needed.
+ */
+
+ if (rt_flag)
+ blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ else
+ blocksize_bytes = mp->m_sb.sb_blocksize;
+
if (hint_flag && !xfs_has_reflink(mp))
return __this_address;
@@ -922,16 +946,13 @@ xfs_inode_validate_cowextsize(
if (mode && !hint_flag && cowextsize != 0)
return __this_address;
- if (hint_flag && rt_flag)
- return __this_address;
-
- if (cowextsize_bytes % mp->m_sb.sb_blocksize)
+ if (cowextsize_bytes % blocksize_bytes)
return __this_address;
if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
- if (cowextsize > mp->m_sb.sb_agblocks / 2)
+ if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2)
return __this_address;
return NULL;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index d9b3c182cb40..4f99b90add55 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -28,6 +28,7 @@
#include "xfs_health.h"
#include "xfs_symlink_remote.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
struct kmem_cache *xfs_ifork_cache;
@@ -272,6 +273,8 @@ xfs_iformat_data_fork(
switch (ip->i_metatype) {
case XFS_METAFILE_RTRMAP:
return xfs_iformat_rtrmap(ip, dip);
+ case XFS_METAFILE_RTREFCOUNT:
+ return xfs_iformat_rtrefcount(ip, dip);
default:
break;
}
@@ -620,6 +623,9 @@ xfs_iflush_fork(
case XFS_METAFILE_RTRMAP:
xfs_iflush_rtrmap(ip, dip);
break;
+ case XFS_METAFILE_RTREFCOUNT:
+ xfs_iflush_rtrefcount(ip, dip);
+ break;
default:
ASSERT(0);
break;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a7e0e479454d..ec7157eaba5f 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -252,6 +252,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */
#define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */
#define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */
+#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */
+#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -275,7 +277,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \
{ XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \
{ XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \
- { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }
+ { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \
+ { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \
+ { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" }
/*
* Inode Log Item Format definitions.
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index abc705aff26d..66c7916fb5cd 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -81,6 +81,8 @@ extern const struct xlog_recover_item_ops xlog_rtefi_item_ops;
extern const struct xlog_recover_item_ops xlog_rtefd_item_ops;
extern const struct xlog_recover_item_ops xlog_rtrui_item_ops;
extern const struct xlog_recover_item_ops xlog_rtrud_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtcud_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 07e2f5fb3a94..a85ecddaa48e 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -85,6 +85,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48);
XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4);
/*
* m68k has problems with struct xfs_attr_leaf_name_remote, but we pad
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index bbb86dc9a25c..cebe83f7842a 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -25,6 +25,9 @@
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_refcount_item.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtrefcount_btree.h"
struct kmem_cache *xfs_refcount_intent_cache;
@@ -128,7 +131,7 @@ xfs_refcount_check_irec(
struct xfs_perag *pag,
const struct xfs_refcount_irec *irec)
{
- if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
return __this_address;
if (!xfs_refcount_check_domain(irec))
@@ -138,12 +141,43 @@ xfs_refcount_check_irec(
if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
return __this_address;
- if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
+ if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
return __this_address;
return NULL;
}
+/*
+ * Sanity-check a refcount record (irec) against the realtime group @rtg.
+ * This performs the same sequence of checks as xfs_refcount_check_irec(),
+ * except that the extent range is validated with xfs_verify_rgbext().
+ *
+ * Returns NULL if every check passes, or __this_address from the check that
+ * failed.
+ */
+xfs_failaddr_t
+xfs_rtrefcount_check_irec(
+ struct xfs_rtgroup *rtg,
+ const struct xfs_refcount_irec *irec)
+{
+ /* the length must be nonzero and no larger than XFS_REFC_LEN_MAX */
+ if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX)
+ return __this_address;
+
+ /* the refcount must be appropriate for the record's domain */
+ if (!xfs_refcount_check_domain(irec))
+ return __this_address;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount))
+ return __this_address;
+
+ /* the refcount must be nonzero and no larger than XFS_REFC_REFCOUNT_MAX */
+ if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX)
+ return __this_address;
+
+ return NULL;
+}
+
+/*
+ * Check a refcount record with the checker that matches the cursor's btree
+ * type: the rtgroup checker if @cur is a realtime refcount btree cursor,
+ * otherwise the per-AG checker. The cursor's group (cur->bc_group) supplies
+ * the rtgroup or perag to check against.
+ *
+ * Returns whatever the selected checker returns.
+ */
+static inline xfs_failaddr_t
+xfs_refcount_check_btrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec)
+{
+ if (xfs_btree_is_rtrefcount(cur->bc_ops))
+ return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec);
+ return xfs_refcount_check_irec(to_perag(cur->bc_group), irec);
+}
+
static inline int
xfs_refcount_complain_bad_rec(
struct xfs_btree_cur *cur,
@@ -152,9 +186,15 @@ xfs_refcount_complain_bad_rec(
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_warn(mp,
+ if (xfs_btree_is_rtrefcount(cur->bc_ops)) {
+ xfs_warn(mp,
+ "RT Refcount BTree record corruption in rtgroup %u detected at %pS!",
+ cur->bc_group->xg_gno, fa);
+ } else {
+ xfs_warn(mp,
"Refcount BTree record corruption in AG %d detected at %pS!",
cur->bc_group->xg_gno, fa);
+ }
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -180,7 +220,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
- fa = xfs_refcount_check_irec(to_perag(cur->bc_group), irec);
+ fa = xfs_refcount_check_btrec(cur, irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
@@ -853,9 +893,9 @@ xfs_refc_merge_refcount(
const struct xfs_refcount_irec *irec,
enum xfs_refc_adjust_op adjust)
{
- /* Once a record hits MAXREFCOUNT, it is pinned there forever */
- if (irec->rc_refcount == MAXREFCOUNT)
- return MAXREFCOUNT;
+ /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */
+ if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX)
+ return XFS_REFC_REFCOUNT_MAX;
return irec->rc_refcount + adjust;
}
@@ -898,7 +938,7 @@ xfs_refc_want_merge_center(
* hence we need to catch u32 addition overflows here.
*/
ulen += cleft->rc_blockcount + right->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
*ulenp = ulen;
@@ -933,7 +973,7 @@ xfs_refc_want_merge_left(
* hence we need to catch u32 addition overflows here.
*/
ulen += cleft->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
return true;
@@ -967,7 +1007,7 @@ xfs_refc_want_merge_right(
* hence we need to catch u32 addition overflows here.
*/
ulen += cright->rc_blockcount;
- if (ulen >= MAXREFCEXTLEN)
+ if (ulen >= XFS_REFC_LEN_MAX)
return false;
return true;
@@ -1065,7 +1105,7 @@ xfs_refcount_still_have_space(
*/
overhead = xfs_allocfree_block_count(cur->bc_mp,
cur->bc_refc.shape_changes);
- overhead += cur->bc_mp->m_refc_maxlevels;
+ overhead += cur->bc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
@@ -1085,6 +1125,22 @@ xfs_refcount_still_have_space(
cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
+/*
+ * Schedule an extent free for the blocks described by refcount record @rec.
+ *
+ * The record's group-relative start block is converted to a filesystem block
+ * via the cursor's group, and XFS_FREE_EXTENT_REALTIME is set when @cur is a
+ * realtime refcount btree cursor. Returns the result of
+ * xfs_free_extent_later().
+ */
+static int
+xrefc_free_extent(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *rec)
+{
+ unsigned int flags = 0;
+
+ if (xfs_btree_is_rtrefcount(cur->bc_ops))
+ flags |= XFS_FREE_EXTENT_REALTIME;
+
+ return xfs_free_extent_later(cur->bc_tp,
+ xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock),
+ rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags);
+}
+
/*
* Adjust the refcounts of middle extents. At this point we should have
* split extents that crossed the adjustment range; merged with adjacent
@@ -1101,7 +1157,6 @@ xfs_refcount_adjust_extents(
struct xfs_refcount_irec ext, tmp;
int error;
int found_rec, found_tmp;
- xfs_fsblock_t fsbno;
/* Merging did all the work already. */
if (*aglen == 0)
@@ -1117,7 +1172,7 @@ xfs_refcount_adjust_extents(
if (error)
goto out_error;
if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_startblock = xfs_group_max_blocks(cur->bc_group);
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
@@ -1154,11 +1209,7 @@ xfs_refcount_adjust_extents(
goto out_error;
}
} else {
- fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group),
- tmp.rc_startblock);
- error = xfs_free_extent_later(cur->bc_tp, fsbno,
- tmp.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ error = xrefc_free_extent(cur, &tmp);
if (error)
goto out_error;
}
@@ -1196,7 +1247,7 @@ xfs_refcount_adjust_extents(
* Adjust the reference count and either update the tree
* (incr) or free the blocks (decr).
*/
- if (ext.rc_refcount == MAXREFCOUNT)
+ if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX)
goto skip;
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur, &ext);
@@ -1216,11 +1267,7 @@ xfs_refcount_adjust_extents(
}
goto advloop;
} else {
- fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group),
- ext.rc_startblock);
- error = xfs_free_extent_later(cur->bc_tp, fsbno,
- ext.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ error = xrefc_free_extent(cur, &ext);
if (error)
goto out_error;
}
@@ -1417,12 +1464,122 @@ xfs_refcount_finish_one(
}
/*
+ * Set up a continuation of a deferred rtrefcount operation by updating the
+ * intent. Checks to make sure we're not going to run off the end of the
+ * rtgroup.
+ *
+ * Despite its name and type, @new_agbno is used here as a block number
+ * within the intent's rtgroup: it is validated with xfs_verify_rgbext() and
+ * converted with xfs_rgbno_to_rtb() to become the new ri_startblock.
+ *
+ * Returns 0 on success. If the remaining extent fails verification, the
+ * btree is marked sick and -EFSCORRUPTED is returned.
+ */
+static inline int
+xfs_rtrefcount_continue_op(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_intent *ri,
+ xfs_agblock_t new_agbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno,
+ ri->ri_blockcount))) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno);
+
+ ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount));
+ return 0;
+}
+
+/*
+ * Process one of the deferred realtime refcount operations. We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ *
+ * @tp: transaction the work is performed in
+ * @ri: the refcount intent; ri_startblock/ri_blockcount are updated to
+ * describe whatever work is left over
+ * @pcur: in/out cursor; a cursor for a different group is deleted and
+ * replaced with one for the intent's rtgroup
+ *
+ * For increase/decrease intents, a nonzero ri_blockcount on return means the
+ * adjustment stopped early and the intent was updated so the caller can
+ * continue it. CoW alloc/free intents set ri_blockcount to zero on success.
+ *
+ * Returns 0 on success, -EIO if the XFS_ERRTAG_REFCOUNT_FINISH_ONE error
+ * injection point fires, -EFSCORRUPTED for an unknown intent type, or the
+ * error returned by the underlying refcount operation.
+ */
+int
+xfs_rtrefcount_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_refcount_intent *ri,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtgroup *rtg = to_rtg(ri->ri_group);
+ struct xfs_btree_cur *rcur = *pcur;
+ int error = 0;
+ xfs_rgblock_t bno;
+ unsigned long nr_ops = 0;
+ int shape_changes = 0;
+
+ bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock);
+
+ trace_xfs_refcount_deferred(mp, ri);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ return -EIO;
+
+ /*
+ * If we haven't gotten a cursor or the cursor's group doesn't match
+ * the intent's group, get one now. The update counters from the old
+ * cursor are carried over to the new one.
+ */
+ if (rcur != NULL && rcur->bc_group != ri->ri_group) {
+ nr_ops = rcur->bc_refc.nr_ops;
+ shape_changes = rcur->bc_refc.shape_changes;
+ xfs_btree_del_cursor(rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT);
+ *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg);
+
+ rcur->bc_refc.nr_ops = nr_ops;
+ rcur->bc_refc.shape_changes = shape_changes;
+ }
+
+ switch (ri->ri_type) {
+ case XFS_REFCOUNT_INCREASE:
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_INCREASE);
+ if (error)
+ return error;
+ /* blocks left over: point the intent at the remainder */
+ if (ri->ri_blockcount > 0)
+ error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+ break;
+ case XFS_REFCOUNT_DECREASE:
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_DECREASE);
+ if (error)
+ return error;
+ /* blocks left over: point the intent at the remainder */
+ if (ri->ri_blockcount > 0)
+ error = xfs_rtrefcount_continue_op(rcur, ri, bno);
+ break;
+ case XFS_REFCOUNT_ALLOC_COW:
+ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
+ if (error)
+ return error;
+ ri->ri_blockcount = 0;
+ break;
+ case XFS_REFCOUNT_FREE_COW:
+ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
+ if (error)
+ return error;
+ ri->ri_blockcount = 0;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+ if (!error && ri->ri_blockcount > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, ri);
+ return error;
+}
+
+/*
* Record a refcount intent for later processing.
*/
static void
__xfs_refcount_add(
struct xfs_trans *tp,
enum xfs_refcount_intent_type type,
+ bool isrt,
xfs_fsblock_t startblock,
xfs_extlen_t blockcount)
{
@@ -1434,6 +1591,7 @@ __xfs_refcount_add(
ri->ri_type = type;
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
+ ri->ri_realtime = isrt;
xfs_refcount_defer_add(tp, ri);
}
@@ -1444,12 +1602,13 @@ __xfs_refcount_add(
void
xfs_refcount_increase_extent(
struct xfs_trans *tp,
+ bool isrt,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_has_reflink(tp->t_mountp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+ __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock,
PREV->br_blockcount);
}
@@ -1459,12 +1618,13 @@ xfs_refcount_increase_extent(
void
xfs_refcount_decrease_extent(
struct xfs_trans *tp,
+ bool isrt,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_has_reflink(tp->t_mountp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+ __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock,
PREV->br_blockcount);
}
@@ -1666,7 +1826,7 @@ xfs_refcount_adjust_cow_extents(
goto out_error;
}
if (!found_rec) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+ ext.rc_startblock = xfs_group_max_blocks(cur->bc_group);
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
ext.rc_domain = XFS_REFC_DOMAIN_COW;
@@ -1820,6 +1980,7 @@ __xfs_refcount_cow_free(
void
xfs_refcount_alloc_cow_extent(
struct xfs_trans *tp,
+ bool isrt,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
@@ -1828,16 +1989,17 @@ xfs_refcount_alloc_cow_extent(
if (!xfs_has_reflink(mp))
return;
- __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len);
/* Add rmap entry */
- xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
+ xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
}
/* Forget a CoW staging event in the refcount btree. */
void
xfs_refcount_free_cow_extent(
struct xfs_trans *tp,
+ bool isrt,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
@@ -1847,8 +2009,8 @@ xfs_refcount_free_cow_extent(
return;
/* Remove rmap entry */
- xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW);
- __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+ xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len);
}
struct xfs_refcount_recovery {
@@ -1877,8 +2039,7 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (xfs_refcount_check_irec(to_perag(cur->bc_group), &rr->rr_rrec) !=
- NULL ||
+ if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
xfs_btree_mark_sick(cur);
@@ -1893,12 +2054,13 @@ xfs_refcount_recover_extent(
/* Find and remove leftover CoW reservations. */
int
xfs_refcount_recover_cow_leftovers(
- struct xfs_mount *mp,
- struct xfs_perag *pag)
+ struct xfs_group *xg)
{
+ struct xfs_mount *mp = xg->xg_mount;
+ bool isrt = xg->xg_type == XG_TYPE_RTG;
struct xfs_trans *tp;
struct xfs_btree_cur *cur;
- struct xfs_buf *agbp;
+ struct xfs_buf *agbp = NULL;
struct xfs_refcount_recovery *rr, *n;
struct list_head debris;
union xfs_btree_irec low = {
@@ -1911,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers(
xfs_fsblock_t fsb;
int error;
- /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ /* reflink filesystems must not have groups larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG);
BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
- if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
- return -EOPNOTSUPP;
+
+ if (isrt) {
+ if (!xfs_has_rtgroups(mp))
+ return 0;
+ if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS)
+ return -EOPNOTSUPP;
+ } else {
+ if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS)
+ return -EOPNOTSUPP;
+ }
INIT_LIST_HEAD(&debris);
@@ -1932,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers(
if (error)
return error;
- error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
- if (error)
- goto out_trans;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ if (isrt) {
+ xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
+ cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg));
+ } else {
+ error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp);
+ if (error)
+ goto out_trans;
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg));
+ }
/* Find all the leftover CoW staging extents. */
error = xfs_btree_query_range(cur, &low, &high,
xfs_refcount_recover_extent, &debris);
xfs_btree_del_cursor(cur, error);
- xfs_trans_brelse(tp, agbp);
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+ else
+ xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
xfs_trans_cancel(tp);
if (error)
goto out_free;
@@ -1954,14 +2133,15 @@ xfs_refcount_recover_cow_leftovers(
goto out_free;
/* Free the orphan record */
- fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock);
- xfs_refcount_free_cow_extent(tp, fsb,
+ fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock);
+ xfs_refcount_free_cow_extent(tp, isrt, fsb,
rr->rr_rrec.rc_blockcount);
/* Free the block. */
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ XFS_AG_RESV_NONE,
+ isrt ? XFS_FREE_EXTENT_REALTIME : 0);
if (error)
goto out_trans;
@@ -2026,7 +2206,7 @@ xfs_refcount_query_range_helper(
xfs_failaddr_t fa;
xfs_refcount_btrec_to_irec(rec, &irec);
- fa = xfs_refcount_check_irec(to_perag(cur->bc_group), &irec);
+ fa = xfs_refcount_check_btrec(cur, &irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 62d78afcf1f3..f2e299a716a4 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -12,6 +12,7 @@ struct xfs_perag;
struct xfs_btree_cur;
struct xfs_bmbt_irec;
struct xfs_refcount_irec;
+struct xfs_rtgroup;
extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
@@ -60,6 +61,7 @@ struct xfs_refcount_intent {
enum xfs_refcount_intent_type ri_type;
xfs_extlen_t ri_blockcount;
xfs_fsblock_t ri_startblock;
+ bool ri_realtime;
};
/* Check that the refcount is appropriate for the record domain. */
@@ -74,24 +76,25 @@ xfs_refcount_check_domain(
return true;
}
-void xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt,
struct xfs_bmbt_irec *irec);
-void xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt,
struct xfs_bmbt_irec *irec);
-extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+int xfs_refcount_finish_one(struct xfs_trans *tp,
+ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
+int xfs_rtrefcount_finish_one(struct xfs_trans *tp,
struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
xfs_extlen_t *flen, bool find_end_of_shared);
-void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
- xfs_extlen_t len);
-void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
- xfs_extlen_t len);
-extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
- struct xfs_perag *pag);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt,
+ xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt,
+ xfs_fsblock_t fsb, xfs_extlen_t len);
+int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg);
/*
* While we're adjusting the refcounts records of an extent, we have
@@ -120,6 +123,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
const struct xfs_refcount_irec *irec);
+xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg,
+ const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index f8415fd96cc2..3cdf50563fec 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -285,6 +285,13 @@ xfs_rtrmap_check_meta_irec(
if (irec->rm_blockcount != mp->m_sb.sb_rextsize)
return __this_address;
return NULL;
+ case XFS_RMAP_OWN_COW:
+ if (!xfs_has_rtreflink(mp))
+ return __this_address;
+ if (!xfs_verify_rgbext(rtg, irec->rm_startblock,
+ irec->rm_blockcount))
+ return __this_address;
+ return NULL;
default:
return __this_address;
}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index b7ed2d27d545..a6468e591232 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -34,6 +34,7 @@
#include "xfs_metafile.h"
#include "xfs_metadir.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/* Find the first usable fsblock in this rtgroup. */
static inline uint32_t
@@ -206,6 +207,9 @@ xfs_rtgroup_lock(
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL);
}
/* Unlock metadata inodes associated with this rt group. */
@@ -218,6 +222,9 @@ xfs_rtgroup_unlock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL);
+
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
@@ -249,6 +256,9 @@ xfs_rtgroup_trans_join(
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL);
+
+ if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg))
+ xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL);
}
/* Retrieve rt group geometry. */
@@ -367,6 +377,15 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.enabled = xfs_has_rmapbt,
.create = xfs_rtrmapbt_create,
},
+ [XFS_RTGI_REFCOUNT] = {
+ .name = "refcount",
+ .metafile_type = XFS_METAFILE_RTREFCOUNT,
+ .sick = XFS_SICK_RG_REFCNTBT,
+ .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE,
+ /* same comment about growfs and rmap inodes applies here */
+ .enabled = xfs_has_reflink,
+ .create = xfs_rtrefcountbt_create,
+ },
};
/* Return the shortname of this rtgroup inode. */
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 6ff222a05367..03f39d4e43fc 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -15,6 +15,7 @@ enum xfs_rtg_inodes {
XFS_RTGI_BITMAP, /* allocation bitmap */
XFS_RTGI_SUMMARY, /* allocation summary */
XFS_RTGI_RMAP, /* rmap btree inode */
+ XFS_RTGI_REFCOUNT, /* refcount btree inode */
XFS_RTGI_MAX,
};
@@ -80,6 +81,11 @@ static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg)
return rtg->rtg_inodes[XFS_RTGI_RMAP];
}
+/* Return the refcount btree inode slot of this rtgroup (may be NULL). */
+static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg)
+{
+	struct xfs_inode	*ip = rtg->rtg_inodes[XFS_RTGI_REFCOUNT];
+
+	return ip;
+}
+
/* Passive rtgroup references */
static inline struct xfs_rtgroup *
xfs_rtgroup_get(
@@ -267,10 +273,13 @@ int xfs_update_last_rtgroup_size(struct xfs_mount *mp,
#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1)
/* Lock the rt rmap inode in exclusive mode */
#define XFS_RTGLOCK_RMAP (1U << 2)
+/* Lock the rt refcount inode in exclusive mode */
+#define XFS_RTGLOCK_REFCOUNT (1U << 3)
#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \
XFS_RTGLOCK_BITMAP_SHARED | \
- XFS_RTGLOCK_RMAP)
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
new file mode 100644
index 000000000000..3db5e7a4a945
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_metafile.h"
+#include "xfs_health.h"
+
+static struct kmem_cache *xfs_rtrefcountbt_cur_cache;
+
+/*
+ * Realtime Reference Count btree.
+ *
+ * This is a btree used to track the reference counts of extents in the
+ * realtime device. See the comments in xfs_refcount_btree.c for more information.
+ *
+ * This tree is basically the same as the regular refcount btree except that
+ * it's rooted in an inode.
+ */
+
+/*
+ * Duplicate a cursor by opening a fresh rtrefcount cursor on the same
+ * transaction and realtime group as @cur.
+ */
+static struct xfs_btree_cur *
+xfs_rtrefcountbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_rtgroup	*rtg = to_rtg(cur->bc_group);
+
+	return xfs_rtrefcountbt_init_cursor(cur->bc_tp, rtg);
+}
+
+/*
+ * Return the minimum number of records for a block at @level.  The root
+ * level is sized from the incore inode root buffer (half of what fits);
+ * every other level uses the precomputed per-mount geometry.
+ */
+STATIC int
+xfs_rtrefcountbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	struct xfs_ifork	*ifp;
+
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_rtrefc_mnr[level != 0];
+
+	ifp = xfs_btree_ifork_ptr(cur);
+	return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+			level == 0) / 2;
+}
+
+/*
+ * Return the maximum number of records for a block at @level.  The root
+ * level is sized from the incore inode root buffer; every other level uses
+ * the precomputed per-mount geometry.
+ */
+STATIC int
+xfs_rtrefcountbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	struct xfs_ifork	*ifp;
+
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_rtrefc_mxr[level != 0];
+
+	ifp = xfs_btree_ifork_ptr(cur);
+	return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+			level == 0);
+}
+
+/*
+ * Calculate how many records (leaf) or key/pointer entries (node) fit in a
+ * realtime refcount btree inode root that is @blocklen bytes long.
+ *
+ * NOTE(review): the node case budgets two keys per pointer even though
+ * xfs_rtrefcountbt_ops declares a single-key key_len.  The result fixes
+ * where the ondisk pointers live, so confirm against the ondisk format
+ * before touching it.
+ */
+unsigned int
+xfs_rtrefcountbt_droot_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	unsigned int		payload;
+	size_t			entsize;
+
+	/* Whatever follows the root header is available for entries. */
+	payload = blocklen - sizeof(struct xfs_rtrefcount_root);
+
+	if (leaf)
+		entsize = sizeof(struct xfs_refcount_rec);
+	else
+		entsize = 2 * sizeof(struct xfs_refcount_key) +
+			  sizeof(xfs_rtrefcount_ptr_t);
+
+	return payload / entsize;
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * Non-root levels answer from the per-mount geometry, exactly like
+ * xfs_rtrefcountbt_get_maxrecs.  The root level instead asks how much room
+ * the dinode fork offers, so that the in-memory root buffer can be resized
+ * to match it.
+ */
+STATIC int
+xfs_rtrefcountbt_get_dmaxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1)
+		return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize,
+				level == 0);
+
+	return cur->bc_mp->m_rtrefc_mxr[level != 0];
+}
+
+/* Build the (low) key of a record: it is the record's ondisk startblock. */
+STATIC void
+xfs_rtrefcountbt_init_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	/* Both fields are already big-endian, so copy without conversion. */
+	key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+/*
+ * Build the high key of a record: the last block it covers, i.e.
+ * startblock + blockcount - 1, computed in 32-bit unsigned arithmetic.
+ */
+STATIC void
+xfs_rtrefcountbt_init_high_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	__u32				last;
+
+	last = be32_to_cpu(rec->refc.rc_startblock) +
+	       be32_to_cpu(rec->refc.rc_blockcount) - 1;
+	key->refc.rc_startblock = cpu_to_be32(last);
+}
+
+/*
+ * Fill out an ondisk record from the incore record stashed in the cursor.
+ * The start block is combined with the record domain before being stored.
+ */
+STATIC void
+xfs_rtrefcountbt_init_rec_from_cur(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_rec		*rec)
+{
+	const struct xfs_refcount_irec	*irec = &cur->bc_rec.rc;
+	uint32_t			encoded;
+
+	encoded = xfs_refcount_encode_startblock(irec->rc_startblock,
+			irec->rc_domain);
+
+	rec->refc.rc_startblock = cpu_to_be32(encoded);
+	rec->refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
+	rec->refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+}
+
+/* Initialize a btree pointer from the cursor: always the zero long pointer. */
+STATIC void
+xfs_rtrefcountbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	/* @cur is not consulted. */
+	ptr->l = 0;
+}
+
+/*
+ * Compare @key against the lookup record held in the cursor.  Returns the
+ * signed difference between the key's start block and the cursor record's
+ * domain-encoded start block.
+ */
+STATIC int64_t
+xfs_rtrefcountbt_key_diff(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key)
+{
+	const struct xfs_refcount_irec	*irec = &cur->bc_rec.rc;
+	int64_t				key_start;
+	uint32_t			want;
+
+	key_start = be32_to_cpu(key->refc.rc_startblock);
+	want = xfs_refcount_encode_startblock(irec->rc_startblock,
+			irec->rc_domain);
+
+	return key_start - want;
+}
+
+/*
+ * Return the signed difference between the start blocks of @k1 and @k2.
+ * If a @mask is supplied it must select the startblock field.
+ */
+STATIC int64_t
+xfs_rtrefcountbt_diff_two_keys(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	int64_t				a;
+	int64_t				b;
+
+	ASSERT(!mask || mask->refc.rc_startblock);
+
+	a = be32_to_cpu(k1->refc.rc_startblock);
+	b = be32_to_cpu(k2->refc.rc_startblock);
+	return a - b;
+}
+
+/*
+ * Structure verifier for an rtrefcount btree block held in a buffer.
+ * Returns NULL if the block passes every check, or the address of the
+ * check that failed.
+ */
+static xfs_failaddr_t
+xfs_rtrefcountbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ int level;
+
+ /* The magic number must match one listed in the buffer ops. */
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ /* These blocks are only valid on a reflink-enabled filesystem. */
+ if (!xfs_has_reflink(mp))
+ return __this_address;
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+ /* Reject blocks that claim to sit above the tallest possible tree. */
+ level = be16_to_cpu(block->bb_level);
+ if (level > mp->m_rtrefc_maxlevels)
+ return __this_address;
+
+ /* Check the rest against the leaf or node record limit, as appropriate. */
+ return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]);
+}
+
+/*
+ * Read verifier: check the CRC first and, only if that is good, the block
+ * structure.  Any failure is recorded on the buffer and traced.
+ */
+static void
+xfs_rtrefcountbt_read_verify(
+	struct xfs_buf		*bp)
+{
+	xfs_failaddr_t		fa;
+
+	if (xfs_btree_fsblock_verify_crc(bp)) {
+		fa = xfs_rtrefcountbt_verify(bp);
+		if (fa)
+			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+	} else {
+		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+	}
+
+	if (bp->b_error)
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+}
+
+/*
+ * Write verifier: refuse to write a structurally bad block; otherwise
+ * compute the block CRC before it goes out.
+ */
+static void
+xfs_rtrefcountbt_write_verify(
+	struct xfs_buf		*bp)
+{
+	xfs_failaddr_t		fa = xfs_rtrefcountbt_verify(bp);
+
+	if (!fa) {
+		xfs_btree_fsblock_calc_crc(bp);
+		return;
+	}
+
+	trace_xfs_btree_corrupt(bp, _RET_IP_);
+	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/*
+ * Buffer operations for rtrefcount btree blocks: the accepted magic numbers
+ * plus the read, write and structure verifiers defined above.
+ */
+const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = {
+ .name = "xfs_rtrefcountbt",
+ .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) },
+ .verify_read = xfs_rtrefcountbt_read_verify,
+ .verify_write = xfs_rtrefcountbt_write_verify,
+ .verify_struct = xfs_rtrefcountbt_verify,
+};
+
+/* Return nonzero if @k1 sorts strictly before @k2 by start block. */
+STATIC int
+xfs_rtrefcountbt_keys_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	uint32_t			first = be32_to_cpu(k1->refc.rc_startblock);
+	uint32_t			second = be32_to_cpu(k2->refc.rc_startblock);
+
+	return first < second;
+}
+
+/*
+ * Return nonzero if @r1 ends at or before the block where @r2 starts,
+ * i.e. the two records are in order and do not overlap.
+ */
+STATIC int
+xfs_rtrefcountbt_recs_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	uint32_t			r1_end;
+	uint32_t			r2_start;
+
+	r1_end = be32_to_cpu(r1->refc.rc_startblock) +
+		 be32_to_cpu(r1->refc.rc_blockcount);
+	r2_start = be32_to_cpu(r2->refc.rc_startblock);
+
+	return r1_end <= r2_start;
+}
+
+/*
+ * Classify how the start blocks of @key1 and @key2 relate to each other,
+ * as decided by xbtree_key_contig().  If a @mask is supplied it must select
+ * the startblock field.
+ */
+STATIC enum xbtree_key_contig
+xfs_rtrefcountbt_keys_contiguous(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key1,
+	const union xfs_btree_key	*key2,
+	const union xfs_btree_key	*mask)
+{
+	uint32_t			first;
+	uint32_t			second;
+
+	ASSERT(!mask || mask->refc.rc_startblock);
+
+	first = be32_to_cpu(key1->refc.rc_startblock);
+	second = be32_to_cpu(key2->refc.rc_startblock);
+	return xbtree_key_contig(first, second);
+}
+
+/*
+ * Relocate the block pointers of an incore node root.
+ *
+ * The pointer array starts after room for the maximum number of keys that a
+ * root of a given byte size can hold, so its offset changes whenever the
+ * root is resized.  Move @numrecs pointers from where they sit in a root of
+ * @old_size bytes to where they belong in a root of @new_size bytes.  The
+ * two ranges may overlap, hence memmove.
+ *
+ * @old_size is unsigned int, matching what the callers pass and what
+ * xfs_rtrefcount_broot_ptr_addr() accepts, so the value is never narrowed.
+ */
+static inline void
+xfs_rtrefcountbt_move_ptrs(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*broot,
+	unsigned int		old_size,
+	size_t			new_size,
+	unsigned int		numrecs)
+{
+	void			*dptr;
+	void			*sptr;
+
+	sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size);
+	dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size);
+	memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t));
+}
+
+/*
+ * Resize the incore rtrefcount btree root so that it holds @new_numrecs
+ * records (leaf root) or key/pointer pairs (node root).
+ *
+ * Returns the root block after resizing.  When the size does not change the
+ * existing root is returned untouched; when the root grows from, or shrinks
+ * to, zero bytes the result of xfs_broot_realloc() is returned directly.
+ */
+static struct xfs_btree_block *
+xfs_rtrefcountbt_broot_realloc(
+	struct xfs_btree_cur	*cur,
+	unsigned int		new_numrecs)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
+	struct xfs_btree_block	*broot;
+	unsigned int		new_size;
+	unsigned int		old_size = ifp->if_broot_bytes;
+	const unsigned int	level = cur->bc_nlevels - 1;
+
+	new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs);
+
+	/* Handle the nop case quietly. */
+	if (new_size == old_size)
+		return ifp->if_broot;
+
+	if (new_size > old_size) {
+		unsigned int	old_numrecs;
+
+		/*
+		 * If there wasn't any memory allocated before, just allocate
+		 * it now and get out.
+		 */
+		if (old_size == 0)
+			return xfs_broot_realloc(ifp, new_size);
+
+		/*
+		 * If there is already an existing if_broot, then we need to
+		 * realloc it and possibly move the node block pointers because
+		 * those are not butted up against the btree block header.
+		 *
+		 * The last argument of xfs_rtrefcountbt_maxrecs() is a "leaf"
+		 * flag, not a level, so pass level == 0 just like the
+		 * get_minrecs/get_maxrecs methods do.
+		 */
+		old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size,
+				level == 0);
+		broot = xfs_broot_realloc(ifp, new_size);
+		if (level > 0)
+			xfs_rtrefcountbt_move_ptrs(mp, broot, old_size,
+					new_size, old_numrecs);
+		goto out_broot;
+	}
+
+	/*
+	 * We're reducing numrecs.  If we're going all the way to zero, just
+	 * free the block.
+	 */
+	ASSERT(ifp->if_broot != NULL && old_size > 0);
+	if (new_size == 0)
+		return xfs_broot_realloc(ifp, 0);
+
+	/*
+	 * Shrink the btree root by possibly moving the rtrefcountbt pointers,
+	 * since they are not butted up against the btree block header.  Then
+	 * reallocate broot.
+	 */
+	if (level > 0)
+		xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size,
+				new_size, new_numrecs);
+	broot = xfs_broot_realloc(ifp, new_size);
+
+out_broot:
+	/* The ondisk form of the resized root must still fit in the fork. */
+	ASSERT(xfs_rtrefcount_droot_space(broot) <=
+	       xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork));
+	return broot;
+}
+
+/*
+ * Btree operations for the realtime refcount btree: an inode-rooted btree
+ * with long pointers whose keys and records use the refcount btree ondisk
+ * structures.
+ */
+const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
+ .name = "rtrefcount",
+ .type = XFS_BTREE_TYPE_INODE,
+ .geom_flags = XFS_BTGEO_IROOT_RECORDS,
+
+ /* One key per entry; there is no separate high key stored in blocks. */
+ .rec_len = sizeof(struct xfs_refcount_rec),
+ .key_len = sizeof(struct xfs_refcount_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_REFC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2),
+ .sick_mask = XFS_SICK_RG_REFCNTBT,
+
+ .dup_cursor = xfs_rtrefcountbt_dup_cursor,
+ .alloc_block = xfs_btree_alloc_metafile_block,
+ .free_block = xfs_btree_free_metafile_block,
+ .get_minrecs = xfs_rtrefcountbt_get_minrecs,
+ .get_maxrecs = xfs_rtrefcountbt_get_maxrecs,
+ .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur,
+ .key_diff = xfs_rtrefcountbt_key_diff,
+ .buf_ops = &xfs_rtrefcountbt_buf_ops,
+ .diff_two_keys = xfs_rtrefcountbt_diff_two_keys,
+ .keys_inorder = xfs_rtrefcountbt_keys_inorder,
+ .recs_inorder = xfs_rtrefcountbt_recs_inorder,
+ .keys_contiguous = xfs_rtrefcountbt_keys_contiguous,
+ .broot_realloc = xfs_rtrefcountbt_broot_realloc,
+};
+
+/*
+ * Allocate a new rt refcount btree cursor for @rtg.  The refcount inode of
+ * the group must already be ILOCKed (shared or exclusive).  The cursor
+ * takes its own hold on the group and works on the inode's data fork.
+ */
+struct xfs_btree_cur *
+xfs_rtrefcountbt_init_cursor(
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg)
+{
+	struct xfs_mount	*mp = rtg_mount(rtg);
+	struct xfs_inode	*ip = rtg_refcount(rtg);
+	struct xfs_btree_cur	*cur;
+
+	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops,
+			mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache);
+
+	/* Tree height comes from the level stored in the incore root. */
+	cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
+	cur->bc_group = xfs_group_hold(rtg_group(rtg));
+
+	cur->bc_ino.ip = ip;
+	cur->bc_ino.whichfork = XFS_DATA_FORK;
+	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
+
+	cur->bc_refc.nr_ops = 0;
+	cur->bc_refc.shape_changes = 0;
+	return cur;
+}
+
+/*
+ * Install a new rt reference count btree root.  Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rtrefcountbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_trans	*tp)
+{
+	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
+	struct xfs_inode	*ip = cur->bc_ino.ip;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);
+
+	/*
+	 * Release whatever the real data fork currently owns, then take over
+	 * the staging fork wholesale with a shallow copy.
+	 */
+	xfs_idestroy_fork(ifp);
+	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+	/* Record the group number in the inode and log core + btree root. */
+	ip->i_projid = cur->bc_group->xg_gno;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
+}
+
+/*
+ * Calculate how many records (leaf) or key/pointer pairs (node) fit in
+ * @blocklen bytes of realtime refcount btree block payload.
+ */
+static inline unsigned int
+xfs_rtrefcountbt_block_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	size_t			entsize;
+
+	if (leaf)
+		entsize = sizeof(struct xfs_refcount_rec);
+	else
+		entsize = sizeof(struct xfs_refcount_key) +
+			  sizeof(xfs_rtrefcount_ptr_t);
+
+	return blocklen / entsize;
+}
+
+/*
+ * Calculate the number of records in a realtime refcount btree block of
+ * @blocklen bytes, header included.  @leaf selects the record layout
+ * rather than the key/pointer layout.
+ */
+unsigned int
+xfs_rtrefcountbt_maxrecs(
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	unsigned int		payload = blocklen - XFS_RTREFCOUNT_BLOCK_LEN;
+
+	return xfs_rtrefcountbt_block_maxrecs(payload, leaf);
+}
+
+/*
+ * Compute the max possible height for realtime refcount btrees, using the
+ * smallest CRC-enabled block size and half-full blocks.
+ */
+unsigned int
+xfs_rtrefcountbt_maxlevels_ondisk(void)
+{
+	unsigned int		blocklen;
+	unsigned int		minrecs[2];
+
+	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	/* Index 0 is the leaf minimum, index 1 the node minimum. */
+	minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2;
+	minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2;
+
+	/* We need at most one record for every block in an rt group. */
+	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+}
+
+/*
+ * Create the slab cache for rtrefcount btree cursors, sized for the tallest
+ * tree the ondisk format allows.  Returns 0 or -ENOMEM.
+ */
+int __init
+xfs_rtrefcountbt_init_cur_cache(void)
+{
+	unsigned int		maxlevels = xfs_rtrefcountbt_maxlevels_ondisk();
+
+	xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur",
+			xfs_btree_cur_sizeof(maxlevels), 0, 0, NULL);
+
+	return xfs_rtrefcountbt_cur_cache ? 0 : -ENOMEM;
+}
+
+/* Tear down the cursor cache and forget the stale pointer. */
+void
+xfs_rtrefcountbt_destroy_cur_cache(void)
+{
+	struct kmem_cache	*cache = xfs_rtrefcountbt_cur_cache;
+
+	kmem_cache_destroy(cache);
+	xfs_rtrefcountbt_cur_cache = NULL;
+}
+
+/*
+ * Compute the maximum height of a realtime refcount btree for this mount
+ * and store it in mp->m_rtrefc_maxlevels (zero without rt reflink).
+ */
+void
+xfs_rtrefcountbt_compute_maxlevels(
+	struct xfs_mount	*mp)
+{
+	unsigned int		d_maxlevels;
+	unsigned int		r_maxlevels;
+
+	if (!xfs_has_rtreflink(mp)) {
+		mp->m_rtrefc_maxlevels = 0;
+		return;
+	}
+
+	/*
+	 * The realtime refcountbt lives on the data device, so two limits
+	 * apply: how tall a tree the data device has room for, and how tall
+	 * a tree is needed to hold one refcount record per rtextent in an
+	 * rt group.
+	 */
+	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr,
+			mp->m_sb.sb_rgextents);
+	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr,
+			mp->m_sb.sb_dblocks);
+
+	/* Take the tighter limit, plus one level for the inode root. */
+	mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
+}
+
+/*
+ * Calculate the rtrefcount btree size needed for @len records, using the
+ * per-mount minimum records per block.
+ */
+unsigned long long
+xfs_rtrefcountbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	unsigned long long	size = xfs_btree_calc_size(mp->m_rtrefc_mnr, len);
+
+	return size;
+}
+
+/*
+ * Calculate the maximum rtrefcount btree size for @rtblocks records, or
+ * zero if the btree geometry has not been set up yet.
+ */
+static unsigned long long
+xfs_rtrefcountbt_max_size(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtblocks)
+{
+	/* An uninitialized geometry can happen in mkfs; report no space. */
+	if (mp->m_rtrefc_mxr[0] != 0)
+		return xfs_rtrefcountbt_calc_size(mp, rtblocks);
+
+	return 0;
+}
+
+/*
+ * Figure out how many blocks to reserve for this btree: enough to hold one
+ * record for every rt extent in the rtgroup, or nothing at all when rt
+ * reflink is not enabled.
+ */
+xfs_filblks_t
+xfs_rtrefcountbt_calc_reserves(
+	struct xfs_mount	*mp)
+{
+	if (xfs_has_rtreflink(mp))
+		return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents);
+
+	return 0;
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ *
+ * @dblock is the root as stored in the inode fork, which is @dblocklen
+ * bytes long.  @rblock is the incore root buffer; it must be large enough
+ * for the level and record count found in @dblock (see
+ * xfs_rtrefcount_broot_space_calc).
+ */
+STATIC void
+xfs_rtrefcountbt_from_disk(
+	struct xfs_inode		*ip,
+	struct xfs_rtrefcount_root	*dblock,
+	int				dblocklen,
+	struct xfs_btree_block		*rblock)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_refcount_key		*fkp;
+	__be64				*fpp;
+	struct xfs_refcount_key		*tkp;
+	__be64				*tpp;
+	struct xfs_refcount_rec		*frp;
+	struct xfs_refcount_rec		*trp;
+	unsigned int			numrecs;
+	unsigned int			maxrecs;
+	unsigned int			rblocklen;
+
+	rblocklen = xfs_rtrefcount_broot_space(mp, dblock);
+
+	xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0,
+			ip->i_ino);
+
+	/* Level and record count are stored big-endian in both forms. */
+	rblock->bb_level = dblock->bb_level;
+	rblock->bb_numrecs = dblock->bb_numrecs;
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		/*
+		 * Node root: keys and pointers are separate arrays whose
+		 * pointer offset depends on the size of each container.
+		 */
+		maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+		tkp = xfs_rtrefcount_key_addr(rblock, 1);
+		fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+		tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		numrecs = be16_to_cpu(dblock->bb_numrecs);
+		/*
+		 * Each entry carries exactly one key (see .key_len in
+		 * xfs_rtrefcountbt_ops), so copy numrecs keys and no more.
+		 * Copying twice as many bytes would run past the keys in use
+		 * and scribble on the incore pointer array.
+		 */
+		memcpy(tkp, fkp, sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		/* Leaf root: records follow the header directly. */
+		frp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+		trp = xfs_rtrefcount_rec_addr(rblock, 1);
+		numrecs = be16_to_cpu(dblock->bb_numrecs);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/*
+ * Load a realtime reference count btree root in from disk.
+ *
+ * Returns 0 on success.  If the filesystem lacks reflink, or the ondisk
+ * root claims a level or size that cannot be right, the inode core is
+ * marked sick and -EFSCORRUPTED is returned.
+ */
+int
+xfs_iformat_rtrefcount(
+	struct xfs_inode		*ip,
+	struct xfs_dinode		*dip)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_rtrefcount_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	struct xfs_btree_block		*broot;
+	unsigned int			numrecs;
+	unsigned int			level;
+	int				dsize;
+
+	/*
+	 * growfs must create the rtrefcount inodes before adding a realtime
+	 * volume to the filesystem, so we cannot use the rtrefcount predicate
+	 * here.
+	 */
+	if (!xfs_has_reflink(mp))
+		goto out_corrupt;
+
+	dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
+	numrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	/* The root can be neither too tall nor bigger than the fork. */
+	if (level > mp->m_rtrefc_maxlevels)
+		goto out_corrupt;
+	if (xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize)
+		goto out_corrupt;
+
+	broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK),
+			xfs_rtrefcount_broot_space_calc(mp, level, numrecs));
+	if (broot)
+		xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot);
+	return 0;
+
+out_corrupt:
+	xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+	return -EFSCORRUPTED;
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ *
+ * @rblock is the incore root, @rblocklen bytes long; @dblock is the root
+ * area inside the ondisk inode fork, @dblocklen bytes long.
+ */
+void
+xfs_rtrefcountbt_to_disk(
+	struct xfs_mount		*mp,
+	struct xfs_btree_block		*rblock,
+	int				rblocklen,
+	struct xfs_rtrefcount_root	*dblock,
+	int				dblocklen)
+{
+	struct xfs_refcount_key		*fkp;
+	__be64				*fpp;
+	struct xfs_refcount_key		*tkp;
+	__be64				*tpp;
+	struct xfs_refcount_rec		*frp;
+	struct xfs_refcount_rec		*trp;
+	unsigned int			maxrecs;
+	unsigned int			numrecs;
+
+	/* The incore root must look like a freshly initialized root block. */
+	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC));
+	ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
+	ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
+
+	dblock->bb_level = rblock->bb_level;
+	dblock->bb_numrecs = rblock->bb_numrecs;
+
+	if (be16_to_cpu(rblock->bb_level) > 0) {
+		/*
+		 * Node root: keys and pointers are separate arrays whose
+		 * pointer offset depends on the size of each container.
+		 */
+		maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false);
+		fkp = xfs_rtrefcount_key_addr(rblock, 1);
+		tkp = xfs_rtrefcount_droot_key_addr(dblock, 1);
+		fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen);
+		tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs);
+		numrecs = be16_to_cpu(rblock->bb_numrecs);
+		/*
+		 * Each entry carries exactly one key (see .key_len in
+		 * xfs_rtrefcountbt_ops), so copy numrecs keys and no more.
+		 * Copying twice as many bytes would leak incore pointer
+		 * bytes into unused ondisk key slots.
+		 */
+		memcpy(tkp, fkp, sizeof(*fkp) * numrecs);
+		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
+	} else {
+		/* Leaf root: records follow the header directly. */
+		frp = xfs_rtrefcount_rec_addr(rblock, 1);
+		trp = xfs_rtrefcount_droot_rec_addr(dblock, 1);
+		numrecs = be16_to_cpu(rblock->bb_numrecs);
+		memcpy(trp, frp, sizeof(*frp) * numrecs);
+	}
+}
+
+/*
+ * Flush a realtime reference count btree root out to disk by converting
+ * the incore data fork root of @ip into the data fork area of @dip.
+ */
+void
+xfs_iflush_rtrefcount(
+	struct xfs_inode		*ip,
+	struct xfs_dinode		*dip)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_rtrefcount_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+	/* There must be an incore root, and it must fit in the fork. */
+	ASSERT(ifp->if_broot != NULL);
+	ASSERT(ifp->if_broot_bytes > 0);
+	ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <=
+	       xfs_inode_fork_size(ip, XFS_DATA_FORK));
+
+	xfs_rtrefcountbt_to_disk(mp, ifp->if_broot, ifp->if_broot_bytes, dfp,
+			XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK));
+}
+
+/*
+ * Create a realtime refcount btree inode: switch the data fork of @ip to
+ * the metadata btree format, give it an empty leaf root and log the inode.
+ * @rtg and @init are not consulted here.  Always returns 0.
+ */
+int
+xfs_rtrefcountbt_create(
+	struct xfs_rtgroup	*rtg,
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp,
+	bool			init)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	struct xfs_btree_block	*broot;
+	size_t			rootlen;
+
+	/* The fork must not hold anything yet. */
+	ASSERT(ifp->if_broot_bytes == 0);
+	ASSERT(ifp->if_bytes == 0);
+	ifp->if_format = XFS_DINODE_FMT_META_BTREE;
+
+	/* Initialize the empty incore btree root. */
+	rootlen = xfs_rtrefcount_broot_space_calc(mp, 0, 0);
+	broot = xfs_broot_realloc(ifp, rootlen);
+	if (broot)
+		xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0,
+				ip->i_ino);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
new file mode 100644
index 000000000000..a99b7a8aec86
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_RTREFCOUNT_BTREE_H__
+#define __XFS_RTREFCOUNT_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xbtree_ifakeroot;
+struct xfs_rtgroup;
+
+/* refcounts only exist on crc enabled filesystems */
+#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg);
+struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xbtree_ifakeroot *ifake);
+void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp);
+unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp,
+ unsigned int blocklen, bool leaf);
+void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp);
+unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf);
+
+/*
+ * Addresses of records, keys, and pointers within an incore rtrefcountbt block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+/* Address of the record at 1-based @index within an incore leaf block. */
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_rec_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	size_t			off = XFS_RTREFCOUNT_BLOCK_LEN;
+
+	off += (index - 1) * sizeof(struct xfs_refcount_rec);
+	return (struct xfs_refcount_rec *)((char *)block + off);
+}
+
+/* Address of the key at 1-based @index within an incore node block. */
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_key_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	size_t			off = XFS_RTREFCOUNT_BLOCK_LEN;
+
+	off += (index - 1) * sizeof(struct xfs_refcount_key);
+	return (struct xfs_refcount_key *)((char *)block + off);
+}
+
+/*
+ * Address of the pointer at 1-based @index within an incore node block.
+ * The pointer array starts after room for @maxrecs keys.
+ */
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_ptr_addr(
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	size_t			off = XFS_RTREFCOUNT_BLOCK_LEN;
+
+	off += maxrecs * sizeof(struct xfs_refcount_key);
+	off += (index - 1) * sizeof(xfs_rtrefcount_ptr_t);
+	return (xfs_rtrefcount_ptr_t *)((char *)block + off);
+}
+
+unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void);
+int __init xfs_rtrefcountbt_init_cur_cache(void);
+void xfs_rtrefcountbt_destroy_cur_cache(void);
+
+xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp);
+unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
+/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */
+
+/* Address of the record at 1-based @index within an ondisk leaf root. */
+static inline struct xfs_refcount_rec *
+xfs_rtrefcount_droot_rec_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index)
+{
+	/* Entries begin immediately after the root header. */
+	char				*base = (char *)(block + 1);
+	size_t				off;
+
+	off = (index - 1) * sizeof(struct xfs_refcount_rec);
+	return (struct xfs_refcount_rec *)(base + off);
+}
+
+/* Address of the key at 1-based @index within an ondisk node root. */
+static inline struct xfs_refcount_key *
+xfs_rtrefcount_droot_key_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index)
+{
+	/* Entries begin immediately after the root header. */
+	char				*base = (char *)(block + 1);
+	size_t				off;
+
+	off = (index - 1) * sizeof(struct xfs_refcount_key);
+	return (struct xfs_refcount_key *)(base + off);
+}
+
+/*
+ * Address of the pointer at 1-based @index within an ondisk node root.
+ * The pointer array starts after room for @maxrecs keys.
+ */
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_droot_ptr_addr(
+	struct xfs_rtrefcount_root	*block,
+	unsigned int			index,
+	unsigned int			maxrecs)
+{
+	char				*base = (char *)(block + 1);
+	size_t				off;
+
+	off = maxrecs * sizeof(struct xfs_refcount_key);
+	off += (index - 1) * sizeof(xfs_rtrefcount_ptr_t);
+	return (xfs_rtrefcount_ptr_t *)(base + off);
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * For use when only the byte size of the root block is known and there is
+ * no cursor to ask: the key capacity is derived from @block_size.
+ */
+static inline xfs_rtrefcount_ptr_t *
+xfs_rtrefcount_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		index,
+	unsigned int		block_size)
+{
+	unsigned int		maxrecs;
+
+	maxrecs = xfs_rtrefcountbt_maxrecs(mp, block_size, false);
+	return xfs_rtrefcount_ptr_addr(bb, index, maxrecs);
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records: the block header plus @nrecs records (level 0) or
+ * @nrecs key/pointer pairs (any higher level).
+ */
+static inline size_t
+xfs_rtrefcount_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			entsize;
+
+	if (level > 0)
+		entsize = sizeof(struct xfs_refcount_key) +
+			  sizeof(xfs_rtrefcount_ptr_t);
+	else
+		entsize = sizeof(struct xfs_refcount_rec);
+
+	return XFS_RTREFCOUNT_BLOCK_LEN + nrecs * entsize;
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block, using the level and record count stored in it.
+ */
+static inline size_t
+xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb)
+{
+	unsigned int	level = be16_to_cpu(bb->bb_level);
+	unsigned int	nrecs = be16_to_cpu(bb->bb_numrecs);
+
+	return xfs_rtrefcount_broot_space_calc(mp, level, nrecs);
+}
+
+/*
+ * Compute the space required for the ondisk root block: the root header
+ * plus @nrecs records (level 0) or @nrecs key/pointer pairs (any higher
+ * level).
+ */
+static inline size_t
+xfs_rtrefcount_droot_space_calc(
+	unsigned int		level,
+	unsigned int		nrecs)
+{
+	size_t			entsize;
+
+	if (level > 0)
+		entsize = sizeof(struct xfs_refcount_key) +
+			  sizeof(xfs_rtrefcount_ptr_t);
+	else
+		entsize = sizeof(struct xfs_refcount_rec);
+
+	return sizeof(struct xfs_rtrefcount_root) + nrecs * entsize;
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block, using the level and record count stored in it.
+ */
+static inline size_t
+xfs_rtrefcount_droot_space(struct xfs_btree_block *bb)
+{
+	unsigned int	level = be16_to_cpu(bb->bb_level);
+	unsigned int	nrecs = be16_to_cpu(bb->bb_numrecs);
+
+	return xfs_rtrefcount_droot_space_calc(level, nrecs);
+}
+
+int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp,
+ struct xfs_btree_block *rblock, int rblocklen,
+ struct xfs_rtrefcount_root *dblock, int dblocklen);
+void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip);
+
+int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
+
+#endif /* __XFS_RTREFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index b90901e39e92..e4ec36943cb7 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -718,6 +718,7 @@ xfs_rtrmapbt_maxrecs(
unsigned int
xfs_rtrmapbt_maxlevels_ondisk(void)
{
+ unsigned long long max_dblocks;
unsigned int minrecs[2];
unsigned int blocklen;
@@ -726,8 +727,20 @@ xfs_rtrmapbt_maxlevels_ondisk(void)
minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
- /* We need at most one record for every block in an rt group. */
- return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
+ /*
+ * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs.
+ *
+ * On a reflink filesystem, each block in an rtgroup can have up to
+ * 2^32 (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records. However, we're
+ * likely to run out of blocks in the data device long before that
+ * happens, which means that we must compute the max height based on
+ * what the btree will look like if it consumes almost all the blocks
+ * in the data device due to maximal sharing factor.
+ */
+ max_dblocks = -1U; /* max ag count */
+ max_dblocks *= XFS_MAX_CRC_AG_BLOCKS;
+ return xfs_btree_space_to_height(minrecs, max_dblocks);
}
int __init
@@ -766,9 +779,20 @@ xfs_rtrmapbt_compute_maxlevels(
* maximum height is constrained by the size of the data device and
* the height required to store one rmap record for each block in an
* rt group.
+ *
+ * On a reflink filesystem, each rt block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. This makes the computation of
+ * maxlevels based on record count meaningless, so we only consider the
+ * size of the data device.
*/
d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
mp->m_sb.sb_dblocks);
+ if (xfs_has_rtreflink(mp)) {
+ mp->m_rtrmap_maxlevels = d_maxlevels + 1;
+ return;
+ }
+
r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
mp->m_groups[XG_TYPE_RTG].blocks);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 83fb14b4074c..3dc5f5dba162 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -29,6 +29,7 @@
#include "xfs_exchrange.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1226,6 +1227,13 @@ xfs_sb_mount_common(
mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+ mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+ true);
+ mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize,
+ false);
+ mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2;
+ mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2;
+
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 960716c387cc..b1e0d9bc1f7d 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
+extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
@@ -58,6 +59,7 @@ extern const struct xfs_btree_ops xfs_rmapbt_ops;
extern const struct xfs_btree_ops xfs_rmapbt_mem_ops;
extern const struct xfs_btree_ops xfs_rtrmapbt_ops;
extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops;
+extern const struct xfs_btree_ops xfs_rtrefcountbt_ops;
static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops)
{
@@ -114,6 +116,11 @@ static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops)
return ops == &xfs_rtrmapbt_ops;
}
+static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rtrefcountbt_ops; /* compares the ops pointer itself */
+}
+
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f3392eb2d7f4..13d00c7166e1 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -92,6 +92,14 @@ xfs_refcountbt_block_count(
return num_ops * (2 * mp->m_refc_maxlevels - 1);
}
+static unsigned int
+xfs_rtrefcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_rtrefc_maxlevels - 1); /* per op: 2 * maxlevels - 1 */
+}
+
/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
@@ -259,10 +267,13 @@ xfs_rtalloc_block_count(
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
*
- * This is calculated as:
+ * This is calculated as the max of:
* Data device refcount updates (t1):
* the agfs of the ags containing the blocks: nr_ops * sector size
* the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ * Realtime refcount updates (t2):
+ * the rt refcount inode
+ * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
*/
static unsigned int
xfs_calc_refcountbt_reservation(
@@ -270,12 +281,20 @@ xfs_calc_refcountbt_reservation(
unsigned int nr_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+ unsigned int t1, t2 = 0;
if (!xfs_has_reflink(mp))
return 0;
- return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+ t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+
+ if (xfs_has_realtime(mp))
+ t2 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+ blksz);
+
+ return max(t1, t2);
}
/*
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index b45d2b32051a..cd6f0223879f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -647,7 +647,7 @@ xrep_agfl_fill(
xfs_agblock_t agbno = start;
int error;
- trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
+ trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len);
while (agbno < start + len && af->fl_off < af->flcount)
af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index f6077b0cba8a..66da7d4d56ba 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -347,13 +347,31 @@ xchk_bmap_rt_iextent_xref(
goto out_cur;
rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock);
- xchk_bmap_xref_rmap(info, irec, rgbno);
-
- xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, info->whichfork,
- irec->br_startoff);
- xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
- irec->br_blockcount, &oinfo);
+ switch (info->whichfork) {
+ case XFS_DATA_FORK:
+ xchk_bmap_xref_rmap(info, irec, rgbno);
+ if (!xfs_is_reflink_inode(info->sc->ip)) {
+ xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+ info->whichfork, irec->br_startoff);
+ xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+ irec->br_blockcount, &oinfo);
+ xchk_xref_is_not_rt_shared(info->sc, rgbno,
+ irec->br_blockcount);
+ }
+ xchk_xref_is_not_rt_cow_staging(info->sc, rgbno,
+ irec->br_blockcount);
+ break;
+ case XFS_COW_FORK:
+ xchk_bmap_xref_rmap_cow(info, irec, rgbno);
+ xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+ irec->br_blockcount, &XFS_RMAP_OINFO_COW);
+ xchk_xref_is_rt_cow_staging(info->sc, rgbno,
+ irec->br_blockcount);
+ xchk_xref_is_not_rt_shared(info->sc, rgbno,
+ irec->br_blockcount);
+ break;
+ }
out_cur:
xchk_rtgroup_btcur_free(&info->sc->sr);
out_free:
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index fd64bdf4e138..1084213b8e9b 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -101,14 +101,21 @@ xrep_bmap_discover_shared(
xfs_filblks_t blockcount)
{
struct xfs_scrub *sc = rb->sc;
+ struct xfs_btree_cur *cur;
xfs_agblock_t agbno;
xfs_agblock_t fbno;
xfs_extlen_t flen;
int error;
- agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
- error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount,
- &fbno, &flen, false);
+ if (XFS_IS_REALTIME_INODE(sc->ip)) {
+ agbno = xfs_rtb_to_rgbno(sc->mp, startblock);
+ cur = sc->sr.refc_cur;
+ } else {
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
+ cur = sc->sa.refc_cur;
+ }
+ error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen,
+ false);
if (error)
return error;
@@ -450,7 +457,9 @@ xrep_bmap_scan_rtgroup(
return 0;
error = xrep_rtgroup_init(sc, rtg, &sc->sr,
- XFS_RTGLOCK_RMAP | XFS_RTGLOCK_BITMAP_SHARED);
+ XFS_RTGLOCK_RMAP |
+ XFS_RTGLOCK_REFCOUNT |
+ XFS_RTGLOCK_BITMAP_SHARED);
if (error)
return error;
@@ -903,10 +912,6 @@ xrep_bmap_init_reflink_scan(
if (whichfork != XFS_DATA_FORK)
return RLS_IRRELEVANT;
- /* cannot share realtime extents */
- if (XFS_IS_REALTIME_INODE(sc->ip))
- return RLS_IRRELEVANT;
-
return RLS_UNKNOWN;
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 06cb61e63498..28ad341df8ee 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -37,6 +37,7 @@
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_bmap_util.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -797,6 +798,9 @@ xchk_rtgroup_lock(
if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP))
sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg);
+ if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT))
+ sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg);
+
return 0;
}
@@ -811,7 +815,10 @@ xchk_rtgroup_btcur_free(
{
if (sr->rmap_cur)
xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR);
+ if (sr->refc_cur)
+ xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR);
+ sr->refc_cur = NULL;
sr->rmap_cur = NULL;
}
@@ -1687,6 +1694,9 @@ xchk_meta_btree_count_blocks(
case XFS_METAFILE_RTRMAP:
cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
break;
+ case XFS_METAFILE_RTREFCOUNT:
+ cur = xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg);
+ break;
default:
ASSERT(0);
return -EFSCORRUPTED;
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 50ac6cca18fe..bdcd40f0ec74 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -82,11 +82,13 @@ int xchk_setup_rtbitmap(struct xfs_scrub *sc);
int xchk_setup_rtsummary(struct xfs_scrub *sc);
int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
int xchk_setup_rtrmapbt(struct xfs_scrub *sc);
+int xchk_setup_rtrefcountbt(struct xfs_scrub *sc);
#else
# define xchk_setup_rtbitmap xchk_setup_nothing
# define xchk_setup_rtsummary xchk_setup_nothing
# define xchk_setup_rgsuperblock xchk_setup_nothing
# define xchk_setup_rtrmapbt xchk_setup_nothing
+# define xchk_setup_rtrefcountbt xchk_setup_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -129,7 +131,8 @@ xchk_ag_init_existing(
/* All the locks we need to check an rtgroup. */
#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
- XFS_RTGLOCK_RMAP)
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
struct xchk_rt *sr);
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 5b6194cef3e5..38a246b8bf11 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -26,6 +26,9 @@
#include "xfs_errortag.h"
#include "xfs_icache.h"
#include "xfs_refcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -34,6 +37,7 @@
#include "scrub/bitmap.h"
#include "scrub/off_bitmap.h"
#include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -61,7 +65,10 @@ struct xrep_cow {
struct xoff_bitmap bad_fileoffs;
/* Bitmap of fsblocks that were removed from the CoW fork. */
- struct xfsb_bitmap old_cowfork_fsblocks;
+ union {
+ struct xfsb_bitmap old_cowfork_fsblocks;
+ struct xrtb_bitmap old_cowfork_rtblocks;
+ };
/* CoW fork mappings used to scan for bad CoW staging extents. */
struct xfs_bmbt_irec irec;
@@ -145,8 +152,7 @@ xrep_cow_mark_shared_staging(
xrep_cow_trim_refcount(xc, &rrec, rec);
return xrep_cow_mark_file_range(xc,
- xfs_agbno_to_fsb(to_perag(cur->bc_group),
- rrec.rc_startblock),
+ xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock),
rrec.rc_blockcount);
}
@@ -177,9 +183,8 @@ xrep_cow_mark_missing_staging(
if (xc->next_bno >= rrec.rc_startblock)
goto next;
-
error = xrep_cow_mark_file_range(xc,
- xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno),
+ xfs_gbno_to_fsb(cur->bc_group, xc->next_bno),
rrec.rc_startblock - xc->next_bno);
if (error)
return error;
@@ -222,8 +227,7 @@ xrep_cow_mark_missing_staging_rmap(
}
return xrep_cow_mark_file_range(xc,
- xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno),
- rec_len);
+ xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len);
}
/*
@@ -311,6 +315,92 @@ out_pag:
}
/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad_rt(
+ struct xrep_cow *xc)
+{
+ struct xfs_refcount_irec rc_low = { 0 };
+ struct xfs_refcount_irec rc_high = { 0 };
+ struct xfs_rmap_irec rm_low = { 0 };
+ struct xfs_rmap_irec rm_high = { 0 };
+ struct xfs_scrub *sc = xc->sc;
+ struct xfs_rtgroup *rtg;
+ int error = 0;
+
+ xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock);
+
+ rtg = xfs_rtgroup_get(sc->mp,
+ xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock));
+ if (!rtg)
+ return -EFSCORRUPTED;
+
+ error = xrep_rtgroup_init(sc, rtg, &sc->sr,
+ XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
+ if (error)
+ goto out_rtg;
+
+ /* Mark any CoW fork extents that are shared. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_shared_staging, xc);
+ if (error)
+ goto out_sr;
+
+ /* Make sure there are CoW staging extents for the whole mapping. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+ xc->next_bno = xc->irec_startbno;
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_missing_staging, xc);
+ if (error)
+ goto out_sr;
+
+ if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+ error = xrep_cow_mark_file_range(xc,
+ xfs_rgbno_to_rtb(rtg, xc->next_bno),
+ xc->irec_startbno + xc->irec.br_blockcount -
+ xc->next_bno);
+ if (error)
+ goto out_sr;
+ }
+
+ /* Mark any area that has an rmap that isn't a CoW staging extent. */
+ rm_low.rm_startblock = xc->irec_startbno;
+ memset(&rm_high, 0xFF, sizeof(rm_high));
+ rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high,
+ xrep_cow_mark_missing_staging_rmap, xc);
+ if (error)
+ goto out_sr;
+
+ /*
+ * If userspace is forcing us to rebuild the CoW fork or someone
+ * turned on the debugging knob, replace everything in the
+ * CoW fork and then scan for staging extents in the refcountbt.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+ xc->irec.br_blockcount);
+ if (error)
+ goto out_sr;
+ }
+
+out_sr:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ xchk_rtgroup_free(sc, &sc->sr);
+out_rtg:
+ xfs_rtgroup_put(rtg);
+ return error;
+}
+
+/*
* Allocate a replacement CoW staging extent of up to the given number of
* blocks, and fill out the mapping.
*/
@@ -343,7 +433,7 @@ xrep_cow_alloc(
if (args.fsbno == NULLFSBLOCK)
return -ENOSPC;
- xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+ xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len);
repl->fsbno = args.fsbno;
repl->len = args.len;
@@ -351,6 +441,32 @@ xrep_cow_alloc(
}
/*
+ * Allocate a replacement rt CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc_rt(
+ struct xfs_scrub *sc,
+ xfs_extlen_t maxlen,
+ struct xrep_cow_extent *repl)
+{
+ xfs_rtxlen_t nr_rtx = xfs_rtb_to_rtx(sc->mp, maxlen);
+ int ret;
+
+ /* Top up the transaction reservation, then try the rt allocation. */
+ ret = xfs_trans_reserve_more(sc->tp, 0, nr_rtx);
+ if (!ret)
+ ret = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, nr_rtx, 1,
+ false, false, &repl->fsbno, &repl->len);
+ if (ret)
+ return ret;
+
+ /* Hand the new extent to the refcount code as CoW staging space. */
+ xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len);
+ return 0;
+}
+
+/*
* Look up the current CoW fork mapping so that we only allocate enough to
* replace a single mapping. If we don't find a mapping that covers the start
* of the file range, or we find a delalloc or written extent, something is
@@ -467,7 +583,10 @@ xrep_cow_replace_range(
*/
alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
nextoff - startoff);
- error = xrep_cow_alloc(sc, alloc_len, &repl);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_cow_alloc_rt(sc, alloc_len, &repl);
+ else
+ error = xrep_cow_alloc(sc, alloc_len, &repl);
if (error)
return error;
@@ -483,8 +602,12 @@ xrep_cow_replace_range(
return error;
/* Note the old CoW staging extents; we'll reap them all later. */
- error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
- repl.len);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks,
+ got.br_startblock, repl.len);
+ else
+ error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks,
+ got.br_startblock, repl.len);
if (error)
return error;
@@ -540,8 +663,16 @@ xrep_bmap_cow(
if (!ifp)
return 0;
- /* realtime files aren't supported yet */
- if (XFS_IS_REALTIME_INODE(sc->ip))
+ /*
+ * Realtime files with large extent sizes are not supported because
+ * we could encounter a CoW mapping that has been partially written
+ * out *and* requires replacement, and there's no solution to that.
+ */
+ if (xfs_inode_has_bigrtalloc(sc->ip))
+ return -EOPNOTSUPP;
+
+ /* Metadata inodes aren't supposed to have data on the rt volume. */
+ if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip))
return -EOPNOTSUPP;
/*
@@ -562,7 +693,10 @@ xrep_bmap_cow(
xc->sc = sc;
xoff_bitmap_init(&xc->bad_fileoffs);
- xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ xrtb_bitmap_init(&xc->old_cowfork_rtblocks);
+ else
+ xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
for_each_xfs_iext(ifp, &icur, &xc->irec) {
if (xchk_should_terminate(sc, &error))
@@ -585,7 +719,10 @@ xrep_bmap_cow(
if (xfs_bmap_is_written_extent(&xc->irec))
continue;
- error = xrep_cow_find_bad(xc);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_cow_find_bad_rt(xc);
+ else
+ error = xrep_cow_find_bad(xc);
if (error)
goto out_bitmap;
}
@@ -600,13 +737,20 @@ xrep_bmap_cow(
* by the refcount btree, not the inode, so it is correct to treat them
* like inode metadata.
*/
- error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
- &XFS_RMAP_OINFO_COW);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks,
+ &XFS_RMAP_OINFO_COW);
+ else
+ error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+ &XFS_RMAP_OINFO_COW);
if (error)
goto out_bitmap;
out_bitmap:
- xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks);
+ else
+ xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
xoff_bitmap_destroy(&xc->bad_fileoffs);
kfree(xc);
return error;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index bcc4244e3b55..3c0f25098b69 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -115,6 +115,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH },
[XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER },
[XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT },
+ [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT },
};
/* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 8e702121dc86..db6edd5a5fe5 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -260,12 +260,7 @@ xchk_inode_extsize(
xchk_ino_set_warning(sc, ino);
}
-/*
- * Validate di_cowextsize hint.
- *
- * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
- * These functions must be kept in sync with each other.
- */
+/* Validate di_cowextsize hint. */
STATIC void
xchk_inode_cowextsize(
struct xfs_scrub *sc,
@@ -276,12 +271,25 @@ xchk_inode_cowextsize(
uint64_t flags2)
{
xfs_failaddr_t fa;
+ uint32_t value = be32_to_cpu(dip->di_cowextsize);
- fa = xfs_inode_validate_cowextsize(sc->mp,
- be32_to_cpu(dip->di_cowextsize), mode, flags,
- flags2);
+ fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);
+
+ /*
+ * XFS allows a sysadmin to change the rt extent size when adding a rt
+ * section to a filesystem after formatting. If there are any
+ * directories with cowextsize and rtinherit set, the hint could become
+ * misaligned with the new rextsize. The verifier doesn't check this,
+ * because we allow rtinherit directories even without an rt device.
+ * Flag this as an administrative warning since we will clean this up
+ * eventually.
+ */
+ if ((flags & XFS_DIFLAG_RTINHERIT) &&
+ (flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ value % sc->mp->m_sb.sb_rextsize > 0)
+ xchk_ino_set_warning(sc, ino);
}
/* Make sure the di_flags make sense for the inode. */
@@ -360,8 +368,9 @@ xchk_inode_flags2(
if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
goto bad;
- /* realtime and reflink make no sense, currently */
- if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
+ /* realtime and reflink don't always go together */
+ if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) &&
+ !xfs_has_rtreflink(mp))
goto bad;
/* no bigtime iflag without the bigtime feature */
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index d7e3f033b160..2f641b6d663e 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -40,6 +40,7 @@
#include "xfs_symlink_remote.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -564,8 +565,6 @@ xrep_dinode_flags(
flags2 |= XFS_DIFLAG2_REFLINK;
else
flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
- if (flags & XFS_DIFLAG_REALTIME)
- flags2 &= ~XFS_DIFLAG2_REFLINK;
if (!xfs_has_bigtime(mp))
flags2 &= ~XFS_DIFLAG2_BIGTIME;
if (!xfs_has_large_extent_counts(mp))
@@ -972,6 +971,34 @@ xrep_dinode_bad_rtrmapbt_fork(
return false;
}
+/* Return true if this refcount-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_rtrefcountbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size)
+{
+ struct xfs_rtrefcount_root *root;
+ unsigned int height;
+ unsigned int nr;
+
+ /* The fork must at least hold the root header we are about to read. */
+ if (dfork_size < sizeof(struct xfs_rtrefcount_root))
+ return true;
+
+ root = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ height = be16_to_cpu(root->bb_level);
+ nr = be16_to_cpu(root->bb_numrecs);
+
+ /*
+ * Garbage if the root is taller than the mount allows, does not fit
+ * in the fork, or sits above level 0 with no records.
+ */
+ return height > sc->mp->m_rtrefc_maxlevels ||
+ xfs_rtrefcount_droot_space_calc(height, nr) > dfork_size ||
+ (height > 0 && nr == 0);
+}
+
/* Check a metadata-btree fork. */
STATIC bool
xrep_dinode_bad_metabt_fork(
@@ -986,6 +1013,8 @@ xrep_dinode_bad_metabt_fork(
switch (be16_to_cpu(dip->di_metatype)) {
case XFS_METAFILE_RTRMAP:
return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size);
+ case XFS_METAFILE_RTREFCOUNT:
+ return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size);
default:
return true;
}
@@ -1251,6 +1280,7 @@ xrep_dinode_ensure_forkoff(
{
struct xfs_bmdr_block *bmdr;
struct xfs_rtrmap_root *rmdr;
+ struct xfs_rtrefcount_root *rcdr;
struct xfs_scrub *sc = ri->sc;
xfs_extnum_t attr_extents, data_extents;
size_t bmdr_minsz = xfs_bmdr_space_calc(1);
@@ -1363,6 +1393,10 @@ xrep_dinode_ensure_forkoff(
rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr);
break;
+ case XFS_METAFILE_RTREFCOUNT:
+ rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr);
+ break;
default:
dfork_min = 0;
break;
@@ -1790,10 +1824,6 @@ xrep_inode_flags(
/* DAX only applies to files and dirs. */
if (!(S_ISREG(mode) || S_ISDIR(mode)))
sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
-
- /* No reflink files on the realtime device. */
- if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
- sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
}
/*
@@ -1909,6 +1939,20 @@ xrep_inode_pptr(
sizeof(struct xfs_attr_sf_hdr), true);
}
+/* Fix COW extent size hint problems. */
+STATIC void
+xrep_inode_cowextsize(
+ struct xfs_scrub *sc)
+{
+ /* Drop rtinherit CoW extsize hints that aren't rextsize-aligned. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ sc->ip->i_cowextsize % sc->mp->m_sb.sb_rextsize > 0) {
+ sc->ip->i_cowextsize = 0;
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ }
+}
+
/* Fix any irregularities in an inode that the verifiers don't catch. */
STATIC int
xrep_inode_problems(
@@ -1932,6 +1976,7 @@ xrep_inode_problems(
if (S_ISDIR(VFS_I(sc->ip)->i_mode))
xrep_inode_dir_size(sc);
xrep_inode_extsize(sc);
+ xrep_inode_cowextsize(sc);
trace_xrep_inode_fixed(sc);
xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index 74d71373e7ed..e21c16fbd15d 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -22,6 +22,7 @@
#include "xfs_attr.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -249,6 +250,8 @@ xchk_setup_metapath(
return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ);
case XFS_SCRUB_METAPATH_RTRMAPBT:
return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP);
+ case XFS_SCRUB_METAPATH_RTREFCOUNTBT:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT);
default:
return -ENOENT;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 183d531875ea..58d6d4ed2853 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -212,12 +212,18 @@ xchk_quota_item(
if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_warning(sc, XFS_DATA_FORK,
offset);
+ if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK,
+ offset);
} else {
if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
offset);
+ if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ offset);
}
- if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
+ if (dq->q_ino.count > fs_icount)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
/*
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index cd51f10f2920..8f4c8d41f308 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -233,7 +233,7 @@ xrep_quota_item(
rqi->need_quotacheck = true;
dirty = true;
}
- if (dq->q_rtb.count > mp->m_sb.sb_rblocks) {
+ if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) {
dq->q_rtb.reserved -= dq->q_rtb.count;
dq->q_rtb.reserved += mp->m_sb.sb_rblocks;
dq->q_rtb.count = mp->m_sb.sb_rblocks;
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 4d7f1b82dc55..b32fb233cf84 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -34,6 +34,8 @@
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -41,6 +43,7 @@
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -311,7 +314,7 @@ xreap_agextent_binval(
}
out:
- trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp);
}
/*
@@ -370,7 +373,8 @@ xreap_agextent_select(
out_found:
*aglenp = len;
- trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
+ trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len,
+ *crosslinked);
out_cur:
xfs_btree_del_cursor(cur, error);
return error;
@@ -409,7 +413,8 @@ xreap_agextent_iter(
* to run xfs_repair.
*/
if (crosslinked) {
- trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
+ *aglenp);
rs->force_roll = true;
@@ -419,7 +424,8 @@ xreap_agextent_iter(
* records from the refcountbt, which will remove the
* rmap record as well.
*/
- xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
+ *aglenp);
return 0;
}
@@ -427,7 +433,7 @@ xreap_agextent_iter(
*aglenp, rs->oinfo);
}
- trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
/*
* Invalidate as many buffers as we can, starting at agbno. If this
@@ -451,7 +457,7 @@ xreap_agextent_iter(
if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
ASSERT(rs->resv == XFS_AG_RESV_NONE);
- xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp);
error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
@@ -678,6 +684,225 @@ xrep_reap_fsblocks(
return 0;
}
+#ifdef CONFIG_XFS_RT
+/*
+ * Figure out the longest run of blocks that we can dispose of with a single
+ * call. Cross-linked blocks should have their reverse mappings removed, but
+ * single-owner extents can be freed. Units are rt blocks, not rt extents.
+ */
+STATIC int
+xreap_rgextent_select(
+ struct xreap_state *rs,
+ xfs_rgblock_t rgbno,
+ xfs_rgblock_t rgbno_next,
+ bool *crosslinked,
+ xfs_extlen_t *rglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_btree_cur *cur;
+ xfs_rgblock_t bno = rgbno + 1;
+ xfs_extlen_t len = 1;
+ int error;
+
+ /*
+ * Determine if there are any other rmap records covering the first
+ * block of this extent. If so, the block is crosslinked.
+ */
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
+ error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo,
+ crosslinked);
+ if (error)
+ goto out_cur;
+
+ /*
+ * Figure out how many of the subsequent blocks have the same crosslink
+ * status.
+ */
+ while (bno < rgbno_next) {
+ bool also_crosslinked;
+
+ error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (*crosslinked != also_crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+ *rglenp = len;
+ trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len,
+ *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Dispose of as much of the beginning of this rtgroup extent as possible.
+ * The number of blocks disposed of will be returned in @rglenp.
+ */
+STATIC int
+xreap_rgextent_iter(
+ struct xreap_state *rs,
+ xfs_rgblock_t rgbno,
+ xfs_extlen_t *rglenp,
+ bool crosslinked)
+{
+ struct xfs_scrub *sc = rs->sc;
+ xfs_rtblock_t rtbno;
+ int error;
+
+ /*
+ * The only caller so far is CoW fork repair, so we only know how to
+ * unlink or free CoW staging extents. Here we don't have to worry
+ * about invalidating buffers!
+ */
+ if (rs->oinfo != &XFS_RMAP_OINFO_COW) {
+ ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW);
+ return -EFSCORRUPTED;
+ }
+ ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+ rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the forward and reverse mapping and move on.
+ */
+ if (crosslinked) {
+ trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno,
+ *rglenp);
+
+ xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+ rs->deferred++;
+ return 0;
+ }
+
+ trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
+
+ /*
+ * The CoW staging extent is not crosslinked. Use deferred work items
+ * to remove the refcountbt records (which removes the rmap records)
+ * and free the extent. We're not worried about the system going down
+ * here because log recovery walks the refcount btree to clean out the
+ * CoW staging extents.
+ */
+ xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+ error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL,
+ rs->resv,
+ XFS_FREE_EXTENT_REALTIME |
+ XFS_FREE_EXTENT_SKIP_DISCARD);
+ if (error)
+ return error;
+
+ rs->deferred++;
+ return 0;
+}
+
+#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
+
+/*
+ * Break a rt file metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately. The extent must
+ * be aligned to a realtime extent.
+ */
+STATIC int
+xreap_rtmeta_extent(
+ uint64_t rtbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno);
+ xfs_rgblock_t rgbno_next = rgbno + len;
+ int error = 0;
+
+ ASSERT(sc->ip != NULL);
+ ASSERT(!sc->sr.rtg);
+
+ /*
+ * We're reaping blocks after repairing file metadata, which means that
+ * we have to set up the rtgroup scrub context (sc->sr) ourselves.
+ */
+ sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno));
+ if (!sc->sr.rtg)
+ return -EFSCORRUPTED;
+
+ xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
+
+ while (rgbno < rgbno_next) {
+ xfs_extlen_t rglen;
+ bool crosslinked;
+
+ error = xreap_rgextent_select(rs, rgbno, rgbno_next,
+ &crosslinked, &rglen);
+ if (error)
+ goto out_unlock;
+
+ error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked);
+ if (error)
+ goto out_unlock;
+
+ if (xreap_want_defer_finish(rs)) {
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out_unlock;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ if (error)
+ goto out_unlock;
+ xreap_reset(rs);
+ }
+
+ rgbno += rglen;
+ }
+
+out_unlock:
+ xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
+ xfs_rtgroup_put(sc->sr.rtg);
+ sc->sr.rtg = NULL;
+ return error;
+}
+
+/*
+ * Dispose of every block of every rt metadata extent in the bitmap.
+ * Do not use this to dispose of the mappings in an ondisk inode fork.
+ */
+int
+xrep_reap_rtblocks(
+ struct xfs_scrub *sc,
+ struct xrtb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+
+ error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
/*
* Dispose of every block of an old metadata btree that used to be rooted in a
* metadata directory file.
@@ -770,7 +995,8 @@ xreap_bmapi_select(
}
imap->br_blockcount = len;
- trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
+ trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len,
+ *crosslinked);
out_cur:
xfs_btree_del_cursor(cur, error);
return error;
@@ -909,7 +1135,8 @@ xreap_bmapi_binval(
}
out:
- trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
+ trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno,
+ imap->br_blockcount);
return 0;
}
@@ -936,7 +1163,7 @@ xrep_reap_bmapi_iter(
* anybody else who thinks they own the block, even though that
* runs the risk of stale buffer warnings in the future.
*/
- trace_xreap_dispose_unmap_extent(sc->sa.pag,
+ trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag),
XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
imap->br_blockcount);
@@ -959,7 +1186,7 @@ xrep_reap_bmapi_iter(
* by a block starting before the first block of the extent but overlap
* anyway.
*/
- trace_xreap_dispose_free_extent(sc->sa.pag,
+ trace_xreap_dispose_free_extent(pag_group(sc->sa.pag),
XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
imap->br_blockcount);
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index 70e5e6bbb8d3..4c8f62701fb3 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -17,6 +17,13 @@ int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc,
struct xfsb_bitmap *bitmap);
+#ifdef CONFIG_XFS_RT
+int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo);
+#else
+# define xrep_reap_rtblocks(...) (-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
+
/* Buffer cache scan context. */
struct xrep_bufscan {
/* Disk address for the buffers we want to scan. */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 1c5e45cc6419..d46528023015 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -421,7 +421,7 @@ xchk_refcount_mergeable(
if (r1->rc_refcount != r2->rc_refcount)
return false;
if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
- MAXREFCEXTLEN)
+ XFS_REFC_LEN_MAX)
return false;
return true;
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index 4e572b81c986..9c8cb5332da0 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -183,13 +183,13 @@ xrep_refc_stash(
if (xchk_should_terminate(sc, &error))
return error;
- irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);
+ irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
error = xrep_refc_check_ext(rr->sc, &irec);
if (error)
return error;
- trace_xrep_refc_found(sc->sa.pag, &irec);
+ trace_xrep_refc_found(pag_group(sc->sa.pag), &irec);
return xfarray_append(rr->refcount_records, &irec);
}
@@ -422,7 +422,7 @@ xrep_refc_find_refcounts(
/*
* Set up a bag to store all the rmap records that we're tracking to
* generate a reference count record. If the size of the bag exceeds
- * MAXREFCOUNT, we clamp rc_refcount.
+ * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
*/
error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
if (error)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 61e414c81253..3b5288d3ef4e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -42,6 +42,7 @@
#include "xfs_rtgroup.h"
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -1009,6 +1010,11 @@ xrep_rtgroup_btcur_init(
(sr->rtlock_flags & XFS_RTGLOCK_RMAP) &&
xfs_has_rtrmapbt(mp))
sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg);
+
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT &&
+ (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) &&
+ xfs_has_rtreflink(mp))
+ sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg);
}
/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ac5962732d26..823c00d1a502 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -50,7 +50,9 @@ xrep_trans_commit(
struct xbitmap;
struct xagb_bitmap;
+struct xrgb_bitmap;
struct xfsb_bitmap;
+struct xrtb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
@@ -98,6 +100,7 @@ int xrep_setup_nlinks(struct xfs_scrub *sc);
int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks);
int xrep_setup_dirtree(struct xfs_scrub *sc);
int xrep_setup_rtrmapbt(struct xfs_scrub *sc);
+int xrep_setup_rtrefcountbt(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -157,11 +160,13 @@ int xrep_rtbitmap(struct xfs_scrub *sc);
int xrep_rtsummary(struct xfs_scrub *sc);
int xrep_rgsuperblock(struct xfs_scrub *sc);
int xrep_rtrmapbt(struct xfs_scrub *sc);
+int xrep_rtrefcountbt(struct xfs_scrub *sc);
#else
# define xrep_rtbitmap xrep_notsupported
# define xrep_rtsummary xrep_notsupported
# define xrep_rgsuperblock xrep_notsupported
# define xrep_rtrmapbt xrep_notsupported
+# define xrep_rtrefcountbt xrep_notsupported
#endif /* CONFIG_XFS_RT */
#ifdef CONFIG_XFS_QUOTA
@@ -235,6 +240,7 @@ xrep_setup_nothing(
#define xrep_setup_dirtree xrep_setup_nothing
#define xrep_setup_metapath xrep_setup_nothing
#define xrep_setup_rtrmapbt xrep_setup_nothing
+#define xrep_setup_rtrefcountbt xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
@@ -273,6 +279,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
#define xrep_metapath xrep_notsupported
#define xrep_rgsuperblock xrep_notsupported
#define xrep_rtrmapbt xrep_notsupported
+#define xrep_rtrefcountbt xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h
new file mode 100644
index 000000000000..4c3126b66dcb
--- /dev/null
+++ b/fs/xfs/scrub/rgb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RGB_BITMAP_H__
+#define __XFS_SCRUB_RGB_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_rgblock_t */
+
+struct xrgb_bitmap {
+ struct xbitmap32 rgbitmap;
+};
+
+static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->rgbitmap);
+}
+
+static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->rgbitmap);
+}
+
+static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap,
+ xfs_rgblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->rgbitmap, start, len);
+}
+
+static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->rgbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_RGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index c2c7b76cc25a..f5f73078ffe2 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -33,6 +33,7 @@
#include "xfs_ag.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtgroup.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -519,6 +520,9 @@ xrep_rmap_scan_meta_btree(
case XFS_METAFILE_RTRMAP:
type = XFS_RTGI_RMAP;
break;
+ case XFS_METAFILE_RTREFCOUNT:
+ type = XFS_RTGI_REFCOUNT;
+ break;
default:
ASSERT(0);
return -EFSCORRUPTED;
@@ -545,6 +549,9 @@ found:
case XFS_METAFILE_RTRMAP:
cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
break;
+ case XFS_METAFILE_RTREFCOUNT:
+ cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
+ break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h
new file mode 100644
index 000000000000..1313ef605511
--- /dev/null
+++ b/fs/xfs/scrub/rtb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTB_BITMAP_H__
+#define __XFS_SCRUB_RTB_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_rtblock_t */
+
+struct xrtb_bitmap {
+ struct xbitmap64 rtbitmap;
+};
+
+static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->rtbitmap);
+}
+
+static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->rtbitmap);
+}
+
+static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap,
+ xfs_rtblock_t start, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->rtbitmap, start, len);
+}
+
+static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->rtbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_RTB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 28c90a31f4c3..e8c776a34c1d 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -105,6 +105,8 @@ xchk_rtbitmap_xref(
return;
xchk_xref_has_no_rt_owner(sc, rgbno, blockcount);
+ xchk_xref_is_not_rt_shared(sc, rgbno, blockcount);
+ xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount);
if (rtb->next_free_rgbno < rgbno)
xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno,
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index c6e33834c5ae..203a1a97c502 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -23,6 +23,7 @@
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_extent_busy.h"
+#include "xfs_refcount.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -183,7 +184,8 @@ xrep_rtbitmap_mark_free(
xfs_rgblock_t rgbno)
{
struct xfs_mount *mp = rtb->sc->mp;
- struct xfs_rtgroup *rtg = rtb->sc->sr.rtg;
+ struct xchk_rt *sr = &rtb->sc->sr;
+ struct xfs_rtgroup *rtg = sr->rtg;
xfs_rtxnum_t startrtx;
xfs_rtxnum_t nextrtx;
xrep_wordoff_t wordoff, nextwordoff;
@@ -191,6 +193,7 @@ xrep_rtbitmap_mark_free(
unsigned int bufwsize;
xfs_extlen_t mod;
xfs_rtword_t mask;
+ enum xbtree_recpacking outcome;
int error;
if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno))
@@ -210,6 +213,25 @@ xrep_rtbitmap_mark_free(
if (mod != mp->m_sb.sb_rextsize - 1)
return -EFSCORRUPTED;
+ /* Must not be shared or CoW staging. */
+ if (sr->refc_cur) {
+ error = xfs_refcount_has_records(sr->refc_cur,
+ XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno,
+ rgbno - rtb->next_rgbno, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ error = xfs_refcount_has_records(sr->refc_cur,
+ XFS_REFC_DOMAIN_COW, rtb->next_rgbno,
+ rgbno - rtb->next_rgbno, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+ }
+
trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1);
/* Set bits as needed to round startrtx up to the nearest word. */
diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
new file mode 100644
index 000000000000..4c5dffc73641
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/repair.h"
+
+/* Set us up with the realtime refcount metadata locked. */
+int
+xchk_setup_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtrefcountbt(sc);
+ if (error)
+ return error;
+ }
+
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
+ error = xchk_setup_rt(sc);
+ if (error)
+ return error;
+
+ error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg));
+ if (error)
+ return error;
+
+ return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+}
+
+/* Realtime Reference count btree scrubber. */
+
+/*
+ * Confirming Reference Counts via Reverse Mappings
+ *
+ * We want to count the reverse mappings overlapping a refcount record
+ * (bno, len, refcount), allowing for the possibility that some of the
+ * overlap may come from smaller adjoining reverse mappings, while some
+ * comes from single extents which overlap the range entirely. The
+ * outer loop is as follows:
+ *
+ * 1. For all reverse mappings overlapping the refcount extent,
+ * a. If a given rmap completely overlaps, mark it as seen.
+ * b. Otherwise, record the fragment (in rgbno order) for later
+ * processing.
+ *
+ * Once we've seen all the rmaps, we know that for all blocks in the
+ * refcount record we want to find $refcount owners and we've already
+ * visited $seen extents that overlap all the blocks. Therefore, we
+ * need to find ($refcount - $seen) owners for every block in the
+ * extent; call that quantity $target_nr. Proceed as follows:
+ *
+ * 2. Pull the first $target_nr fragments from the list; all of them
+ * should start at or before the start of the extent.
+ * Call this subset of fragments the working set.
+ * 3. Until there are no more unprocessed fragments,
+ * a. Find the shortest fragments in the set and remove them.
+ * b. Note the block number of the end of these fragments.
+ * c. Pull the same number of fragments from the list. All of these
+ * fragments should start at the block number recorded in the
+ * previous step.
+ * d. Put those fragments in the set.
+ * 4. Check that there are $target_nr fragments remaining in the list,
+ * and that they all end at or beyond the end of the refcount extent.
+ *
+ * If the refcount is correct, all the check conditions in the algorithm
+ * should always hold true. If not, the refcount is incorrect.
+ */
+struct xchk_rtrefcnt_frag {
+ struct list_head list;
+ struct xfs_rmap_irec rm;
+};
+
+struct xchk_rtrefcnt_check {
+ struct xfs_scrub *sc;
+ struct list_head fragments;
+
+ /* refcount extent we're examining */
+ xfs_rgblock_t bno;
+ xfs_extlen_t len;
+ xfs_nlink_t refcount;
+
+ /* number of owners seen */
+ xfs_nlink_t seen;
+};
+
+/*
+ * Decide if the given rmap is large enough that we can redeem it
+ * towards refcount verification now, or if it's a fragment, in
+ * which case we'll hang onto it in the hopes that we'll later
+ * discover that we've collected exactly the correct number of
+ * fragments as the rtrefcountbt says we should have.
+ */
+STATIC int
+xchk_rtrefcountbt_rmap_check(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xchk_rtrefcnt_check *refchk = priv;
+ struct xchk_rtrefcnt_frag *frag;
+ xfs_rgblock_t rm_last;
+ xfs_rgblock_t rc_last;
+ int error = 0;
+
+ if (xchk_should_terminate(refchk->sc, &error))
+ return error;
+
+ rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
+ rc_last = refchk->bno + refchk->len - 1;
+
+ /* Confirm that a single-owner refc extent is a CoW stage. */
+ if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
+ xchk_btree_xref_set_corrupt(refchk->sc, cur, 0);
+ return 0;
+ }
+
+ if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
+ /*
+ * The rmap covers the entire refcount record, so we can
+ * confirm one refcount owner seen.
+ */
+ refchk->seen++;
+ } else {
+ /*
+ * This rmap covers only part of the refcount record, so
+ * save the fragment for later processing. If the rtrmapbt
+ * is healthy each rmap_irec we see will be in rgbno order
+ * so appending to the tail keeps the list sorted.
+ */
+ frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag),
+ XCHK_GFP_FLAGS);
+ if (!frag)
+ return -ENOMEM;
+ memcpy(&frag->rm, rec, sizeof(frag->rm));
+ list_add_tail(&frag->list, &refchk->fragments);
+ }
+
+ return 0;
+}
+
+/*
+ * Given a bunch of rmap fragments, iterate through them, keeping
+ * a running tally of the refcount. If this ever deviates from
+ * what we expect (which is the rtrefcountbt's refcount minus the
+ * number of extents that totally covered the rtrefcountbt extent),
+ * we have a rtrefcountbt error.
+ */
+STATIC void
+xchk_rtrefcountbt_process_rmap_fragments(
+ struct xchk_rtrefcnt_check *refchk)
+{
+ struct list_head worklist;
+ struct xchk_rtrefcnt_frag *frag;
+ struct xchk_rtrefcnt_frag *n;
+ xfs_rgblock_t bno;
+ xfs_rgblock_t rbno;
+ xfs_rgblock_t next_rbno;
+ xfs_nlink_t nr;
+ xfs_nlink_t target_nr;
+
+ target_nr = refchk->refcount - refchk->seen;
+ if (target_nr == 0)
+ return;
+
+ /*
+ * There are (refchk->refcount - refchk->seen) references we
+ * haven't found yet. Pull that many off the fragment list and
+ * figure out where the smallest rmap ends (and therefore the
+ * next rmap should start). All the rmaps we pull off should
+ * start at or before the beginning of the refcount record's
+ * range.
+ */
+ INIT_LIST_HEAD(&worklist);
+ rbno = NULLRGBLOCK;
+
+ /* Make sure the fragments actually /are/ in bno order. */
+ bno = 0;
+ list_for_each_entry(frag, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock < bno)
+ goto done;
+ bno = frag->rm.rm_startblock;
+ }
+
+ /*
+ * Find all the rmaps that start at or before the refc extent,
+ * and put them on the worklist.
+ */
+ nr = 0;
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
+ break;
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno < rbno)
+ rbno = bno;
+ list_move_tail(&frag->list, &worklist);
+ nr++;
+ }
+
+ /*
+ * We should have found exactly $target_nr rmap fragments starting
+ * at or before the refcount extent.
+ */
+ if (nr != target_nr)
+ goto done;
+
+ while (!list_empty(&refchk->fragments)) {
+ /* Discard any fragments ending at rbno from the worklist. */
+ nr = 0;
+ next_rbno = NULLRGBLOCK;
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno != rbno) {
+ if (bno < next_rbno)
+ next_rbno = bno;
+ continue;
+ }
+ list_del(&frag->list);
+ kfree(frag);
+ nr++;
+ }
+
+ /* Try to add nr rmaps starting at rbno to the worklist. */
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (frag->rm.rm_startblock != rbno)
+ goto done;
+ list_move_tail(&frag->list, &worklist);
+ if (next_rbno > bno)
+ next_rbno = bno;
+ nr--;
+ if (nr == 0)
+ break;
+ }
+
+ /*
+ * If we get here and nr > 0, this means that we added fewer
+ * items to the worklist than we discarded because the fragment
+ * list ran out of items. Therefore, we cannot maintain the
+ * required refcount. Something is wrong, so we're done.
+ */
+ if (nr)
+ goto done;
+
+ rbno = next_rbno;
+ }
+
+ /*
+ * Make sure the last extent we processed ends at or beyond
+ * the end of the refcount extent.
+ */
+ if (rbno < refchk->bno + refchk->len)
+ goto done;
+
+ /* Actually record us having seen the remaining refcount. */
+ refchk->seen = refchk->refcount;
+done:
+ /* Delete fragments and work list. */
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+}
+
+/* Use the rmap entries covering this extent to verify the refcount. */
+STATIC void
+xchk_rtrefcountbt_xref_rmap(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ struct xchk_rtrefcnt_check refchk = {
+ .sc = sc,
+ .bno = irec->rc_startblock,
+ .len = irec->rc_blockcount,
+ .refcount = irec->rc_refcount,
+ .seen = 0,
+ };
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ struct xchk_rtrefcnt_frag *frag;
+ struct xchk_rtrefcnt_frag *n;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Cross-reference with the rmapbt to confirm the refcount. */
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = irec->rc_startblock;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
+
+ INIT_LIST_HEAD(&refchk.fragments);
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+ xchk_rtrefcountbt_rmap_check, &refchk);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ goto out_free;
+
+ xchk_rtrefcountbt_process_rmap_fragments(&refchk);
+ if (irec->rc_refcount != refchk.seen)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+
+out_free:
+ list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_rtrefcountbt_xref(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_rt_space(sc,
+ xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock),
+ irec->rc_blockcount);
+ xchk_rtrefcountbt_xref_rmap(sc, irec);
+}
+
+struct xchk_rtrefcbt_records {
+ /* Previous refcount record. */
+ struct xfs_refcount_irec prev_rec;
+
+ /* The next rtgroup block where we aren't expecting shared extents. */
+ xfs_rgblock_t next_unshared_rgbno;
+
+ /* Number of CoW blocks we expect. */
+ xfs_extlen_t cow_blocks;
+
+ /* Was the last record a shared or CoW staging extent? */
+ enum xfs_refc_domain prev_domain;
+};
+
+static inline bool
+xchk_rtrefcount_mergeable(
+ struct xchk_rtrefcbt_records *rrc,
+ const struct xfs_refcount_irec *r2)
+{
+ const struct xfs_refcount_irec *r1 = &rrc->prev_rec;
+
+ /* Ignore if prev_rec is not yet initialized (zero length). */
+ if (r1->rc_blockcount == 0)
+ return false;
+
+ if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock)
+ return false;
+ if (r1->rc_refcount != r2->rc_refcount)
+ return false;
+ if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
+ XFS_REFC_LEN_MAX)
+ return false;
+
+ return true;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rtrefcountbt_check_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_rtrefcbt_records *rrc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (xchk_rtrefcount_mergeable(rrc, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
+}
+
+STATIC int
+xchk_rtrefcountbt_rmap_check_gap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ xfs_rgblock_t *next_bno = priv;
+
+ if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno)
+ return -ECANCELED;
+
+ *next_bno = rec->rm_startblock + rec->rm_blockcount;
+ return 0;
+}
+
+/*
+ * Make sure that a gap in the reference count records does not correspond to
+ * overlapping records (i.e. shared extents) in the reverse mappings.
+ */
+static inline void
+xchk_rtrefcountbt_xref_gaps(
+ struct xfs_scrub *sc,
+ struct xchk_rtrefcbt_records *rrc,
+ xfs_rtblock_t bno)
+{
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ xfs_rgblock_t next_bno = NULLRGBLOCK;
+ int error;
+
+ if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur ||
+ xchk_skip_xref(sc->sm))
+ return;
+
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = rrc->next_unshared_rgbno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno - 1;
+
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+ xchk_rtrefcountbt_rmap_check_gap, &next_bno);
+ if (error == -ECANCELED)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+ else
+ xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur);
+}
+
+/* Scrub a rtrefcountbt record. */
+STATIC int
+xchk_rtrefcountbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xchk_rtrefcbt_records *rrc = bs->private;
+ struct xfs_refcount_irec irec;
+ u32 mod;
+
+ xfs_refcount_btrec_to_irec(rec, &irec);
+ if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) !=
+ NULL) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* We can only share full rt extents. */
+ mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock);
+ if (mod)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount);
+ if (mod)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+ rrc->cow_blocks += irec.rc_blockcount;
+
+ /* Shared records always come before CoW records. */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
+ rrc->prev_domain == XFS_REFC_DOMAIN_COW)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ rrc->prev_domain = irec.rc_domain;
+
+ xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec);
+ xchk_rtrefcountbt_xref(bs->sc, &irec);
+
+ /*
+ * If this is a record for a shared extent, check that all blocks
+ * between the previous record and this one have at most one reverse
+ * mapping.
+ */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
+ xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock);
+ rrc->next_unshared_rgbno = irec.rc_startblock +
+ irec.rc_blockcount;
+ }
+
+ return 0;
+}
+
+/* Make sure we have as many refc blocks as the rmap says. */
+STATIC void
+xchk_refcount_xref_rmap(
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *btree_oinfo,
+ xfs_extlen_t cow_blocks)
+{
+ xfs_filblks_t refcbt_blocks = 0;
+ xfs_filblks_t blocks;
+ int error;
+
+ if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Check that we saw as many refcbt blocks as the rmap knows about. */
+ error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks);
+ if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error))
+ return;
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo,
+ &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != refcbt_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+ /* Check that we saw as many cow blocks as the rmap knows about. */
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur,
+ &XFS_RMAP_OINFO_COW, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (blocks != cow_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* Scrub the refcount btree for some realtime group. */
+int
+xchk_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xfs_owner_info btree_oinfo;
+ struct xchk_rtrefcbt_records rrc = {
+ .cow_blocks = 0,
+ .next_unshared_rgbno = 0,
+ .prev_domain = XFS_REFC_DOMAIN_SHARED,
+ };
+ int error;
+
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino,
+ XFS_DATA_FORK);
+ error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec,
+ &btree_oinfo, &rrc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ /*
+ * Check that all blocks between the last refcount > 1 record and the
+ * end of the rt volume have at most one reverse mapping.
+ */
+ xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks);
+
+ xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks);
+
+ return 0;
+}
+
+/* xref check that a cow staging extent is marked in the rtrefcountbt. */
+void
+xchk_xref_is_rt_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ struct xfs_refcount_irec rc;
+ int has_refcount;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Find the CoW staging extent. */
+ error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+ bno, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+ return;
+ }
+
+ error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+ return;
+ }
+
+ /* CoW lookup returned a shared extent record? */
+ if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+
+ /* Must be at least as long as what was passed in */
+ if (rc.rc_blockcount < len)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/*
+ * xref check that the extent is not shared. Only file data blocks
+ * can have multiple owners.
+ */
+void
+xchk_xref_is_not_rt_shared(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_records(sc->sr.refc_cur,
+ XFS_REFC_DOMAIN_SHARED, bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/* xref check that the extent is not being used for CoW staging. */
+void
+xchk_xref_is_not_rt_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+ bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c
new file mode 100644
index 000000000000..257cfb24beb4
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount_repair.c
@@ -0,0 +1,783 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+#include "scrub/rcbag.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ * ====================================
+ *
+ * This algorithm is "borrowed" from xfs_repair. Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * rt refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the rt refcount btree root and
+ * insert all the records.
+ */
+
+struct xrep_rtrefc {
+ /*
+ * refcount extents: the struct xfs_refcount_irec records collected for
+ * the new tree
+ */
+ struct xfarray *refcount_records;
+
+ /*
+ * new refcountbt information: staging state and the bulk loader hooks
+ * used while the replacement tree is built
+ */
+ struct xrep_newbt new_btree;
+
+ /*
+ * old refcountbt blocks, tracked as fsblocks; these are reaped after the
+ * new tree has been installed
+ */
+ struct xfsb_bitmap old_rtrefcountbt_blocks;
+
+ struct xfs_scrub *sc;
+
+ /* get_records()'s position in the rt refcount record array. */
+ xfarray_idx_t array_cur;
+
+ /* # of refcountbt blocks */
+ xfs_filblks_t btblocks;
+};
+
+/*
+ * Set us up to repair refcount btrees.  Builds a descriptive label for the
+ * rmap record bag, passes it to xrep_setup_xfbtree(), and frees the label
+ * again before returning that call's result.
+ */
+int
+xrep_setup_rtrefcountbt(
+	struct xfs_scrub	*sc)
+{
+	char			*label;
+	int			ret;
+
+	label = xchk_xfile_ag_descr(sc, "rmap record bag");
+	ret = xrep_setup_xfbtree(sc, label);
+	kfree(label);
+
+	return ret;
+}
+
+/*
+ * Check for any obvious conflicts with this shared/CoW staging extent.
+ * Returns -EFSCORRUPTED if the record fails the generic rt refcount record
+ * checks or is not aligned to rt extent boundaries at both ends; otherwise
+ * returns whatever xrep_require_rtext_inuse() reports for the range.
+ */
+STATIC int
+xrep_rtrefc_check_ext(
+	struct xfs_scrub		*sc,
+	const struct xfs_refcount_irec	*rec)
+{
+	struct xfs_mount		*mp = sc->mp;
+	xfs_rgblock_t			end;
+
+	if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL)
+		return -EFSCORRUPTED;
+
+	/* The first block must sit at the start of an rt extent... */
+	if (xfs_rgbno_to_rtxoff(mp, rec->rc_startblock) != 0)
+		return -EFSCORRUPTED;
+
+	/* ...and the final block must be the last block of an rt extent. */
+	end = rec->rc_startblock + rec->rc_blockcount - 1;
+	if (xfs_rgbno_to_rtxoff(mp, end) != mp->m_sb.sb_rextsize - 1)
+		return -EFSCORRUPTED;
+
+	/* Make sure this isn't free space or misaligned. */
+	return xrep_require_rtext_inuse(sc, rec->rc_startblock,
+			rec->rc_blockcount);
+}
+
+/*
+ * Record a reference count extent.  The count is clamped to
+ * XFS_REFC_REFCOUNT_MAX, the record is sanity checked, and it is then
+ * appended to the array of records for the new btree.
+ */
+STATIC int
+xrep_rtrefc_stash(
+	struct xrep_rtrefc	*rr,
+	enum xfs_refc_domain	domain,
+	xfs_rgblock_t		bno,
+	xfs_extlen_t		len,
+	uint64_t		refcount)
+{
+	struct xfs_refcount_irec rec = {
+		.rc_startblock	= bno,
+		.rc_blockcount	= len,
+		.rc_domain	= domain,
+	};
+	int			ret = 0;
+
+	if (xchk_should_terminate(rr->sc, &ret))
+		return ret;
+
+	/* Clamp the observed count to XFS_REFC_REFCOUNT_MAX. */
+	rec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
+
+	ret = xrep_rtrefc_check_ext(rr->sc, &rec);
+	if (ret)
+		return ret;
+
+	trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &rec);
+
+	return xfarray_append(rr->refcount_records, &rec);
+}
+
+/*
+ * Record a CoW staging extent.  The extent is stashed in the CoW domain
+ * with a reference count of 1.
+ */
+STATIC int
+xrep_rtrefc_stash_cow(
+ struct xrep_rtrefc *rr,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1);
+}
+
+/*
+ * Decide if an rmap could describe a shared extent.  rt metadata (non-inode
+ * owners) are never sharable, and neither are unwritten file blocks.
+ */
+static inline bool
+xrep_rtrefc_rmap_shareable(
+	const struct xfs_rmap_irec	*rmap)
+{
+	return !XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner) &&
+	       !(rmap->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/*
+ * Grab the next (abbreviated) rmap record from the rmapbt.
+ *
+ * Advances the rtgroup rmap cursor one record at a time.  CoW staging rmaps
+ * are stashed as CoW refcount records along the way.  An rmap whose owner
+ * satisfies xfs_is_sb_inum(), or that carries the attr fork or bmbt block
+ * flag, marks the cursor sick and returns -EFSCORRUPTED.  The loop stops at
+ * the first rmap accepted by xrep_rtrefc_rmap_shareable(); that record is
+ * left in @rmap and *@have_rec is set to true.  If the cursor runs out of
+ * records, the function returns 0 with *@have_rec still false.
+ */
+STATIC int
+xrep_rtrefc_walk_rmaps(
+ struct xrep_rtrefc *rr,
+ struct xfs_rmap_irec *rmap,
+ bool *have_rec)
+{
+ struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur;
+ struct xfs_mount *mp = cur->bc_mp;
+ int have_gt;
+ int error = 0;
+
+ *have_rec = false;
+
+ /*
+ * Loop through the remaining rmaps. Remember CoW staging
+ * extents as we pass them. We can only share written data
+ * fork extents, so keep looping until we find an rmap for
+ * one.
+ */
+ do {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (!have_gt)
+ return 0;
+
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, !have_gt)) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
+ if (error)
+ return error;
+ } else if (xfs_is_sb_inum(mp, rmap->rm_owner) ||
+ (rmap->rm_flags & (XFS_RMAP_ATTR_FORK |
+ XFS_RMAP_BMBT_BLOCK))) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+ } while (!xrep_rtrefc_rmap_shareable(rmap));
+
+ *have_rec = true;
+ return 0;
+}
+
+/*
+ * Build the sort key for a refcount record: the start block with the CoW
+ * flag bit cleared, then set again only for records in the CoW domain.
+ */
+static inline uint32_t
+xrep_rtrefc_encode_startblock(
+	const struct xfs_refcount_irec	*irec)
+{
+	uint32_t			key;
+
+	key = irec->rc_startblock & ~XFS_REFC_COWFLAG;
+
+	return irec->rc_domain == XFS_REFC_DOMAIN_COW ?
+			(key | XFS_REFC_COWFLAG) : key;
+}
+
+/*
+ * Compare two refcount records for sorting.  Records are ordered by
+ * increasing encoded start block, i.e. the block number with the CoW flag
+ * folded in by xrep_rtrefc_encode_startblock().
+ */
+static int
+xrep_rtrefc_extent_cmp(
+	const void			*a,
+	const void			*b)
+{
+	const struct xfs_refcount_irec	*left = a;
+	const struct xfs_refcount_irec	*right = b;
+	uint32_t			lkey;
+	uint32_t			rkey;
+
+	lkey = xrep_rtrefc_encode_startblock(left);
+	rkey = xrep_rtrefc_encode_startblock(right);
+
+	if (lkey == rkey)
+		return 0;
+	return lkey < rkey ? -1 : 1;
+}
+
+/*
+ * Sort the refcount extents by startblock or else the btree records will be in
+ * the wrong order. Make sure the records do not overlap in physical space.
+ *
+ * After sorting, the array is walked once.  The domain may switch from
+ * shared to CoW a single time, which restarts the overlap check at block 0.
+ * Any other domain change, or a record that starts before the end of its
+ * predecessor, returns -EFSCORRUPTED.
+ */
+STATIC int
+xrep_rtrefc_sort_records(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_refcount_irec irec;
+ xfarray_idx_t cur;
+ enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED;
+ xfs_rgblock_t next_rgbno = 0;
+ int error;
+
+ error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rr->refcount_records, cur) {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfarray_load(rr->refcount_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (dom == XFS_REFC_DOMAIN_SHARED &&
+ irec.rc_domain == XFS_REFC_DOMAIN_COW) {
+ dom = irec.rc_domain;
+ next_rgbno = 0;
+ }
+
+ if (dom != irec.rc_domain)
+ return -EFSCORRUPTED;
+ if (irec.rc_startblock < next_rgbno)
+ return -EFSCORRUPTED;
+
+ next_rgbno = irec.rc_startblock + irec.rc_blockcount;
+ }
+
+ return error;
+}
+
+/*
+ * Record extents that belong to the realtime refcount inode.  This is the
+ * per-record callback for the AG rmap btree scan: mappings owned by the
+ * inode under repair are validated and added to the bitmap of old btree
+ * blocks; everything else is ignored.
+ */
+STATIC int
+xrep_rtrefc_walk_rmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rtrefc		*rr = priv;
+	struct xfs_scrub		*sc = rr->sc;
+	int				ret = 0;
+
+	if (xchk_should_terminate(sc, &ret))
+		return ret;
+
+	/* Skip extents which are not owned by this inode and fork. */
+	if (rec->rm_owner != sc->ip->i_ino)
+		return 0;
+
+	ret = xrep_check_ino_btree_mapping(sc, rec);
+	if (ret)
+		return ret;
+
+	return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks,
+			xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock),
+			rec->rm_blockcount);
+}
+
+/*
+ * Walk forward through the rmap btree to collect all rmaps starting at
+ * @bno in @rcstack. These represent the file(s) that share ownership of
+ * the current block. Upon return, the rmap cursor points to the last record
+ * satisfying the startblock constraint.
+ *
+ * On entry @rmap and @have describe the record most recently fetched by
+ * xrep_rtrefc_walk_rmaps(); both are updated as the walk proceeds.  Once
+ * the loop ends the cursor is stepped back by one record, and failing to
+ * step back is treated as corruption.
+ */
+static int
+xrep_rtrefc_push_rmaps_at(
+ struct xrep_rtrefc *rr,
+ struct rcbag *rcstack,
+ xfs_rgblock_t bno,
+ struct xfs_rmap_irec *rmap,
+ bool *have)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int have_gt;
+ int error;
+
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
+ if (error)
+ return error;
+
+ error = xrep_rtrefc_walk_rmaps(rr, rmap, have);
+ if (error)
+ return error;
+ }
+
+ error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+ xfs_btree_mark_sick(sc->sr.rmap_cur);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Scan one AG for reverse mappings for the realtime refcount btree.  Every
+ * record in the AG's rmap btree is passed to xrep_rtrefc_walk_rmap(), and
+ * the AG scrub context is released again before returning.
+ */
+STATIC int
+xrep_rtrefc_scan_ag(
+	struct xrep_rtrefc	*rr,
+	struct xfs_perag	*pag)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	int			ret;
+
+	ret = xrep_ag_init(sc, pag, &sc->sa);
+	if (!ret) {
+		ret = xfs_rmap_query_all(sc->sa.rmap_cur,
+				xrep_rtrefc_walk_rmap, rr);
+		xchk_ag_free(sc, &sc->sa);
+	}
+
+	return ret;
+}
+
+/*
+ * Iterate all the rmap records to generate reference count data.
+ *
+ * First, every AG's rmap btree is scanned for blocks owned by the inode
+ * under repair so that the old btree blocks can be reaped later.  Then the
+ * rtgroup rmap btree is walked from its left edge, using a bag of rmaps to
+ * track how many mappings cover the current block.  Each time the bag size
+ * changes, a shared-domain refcount record is stashed for the range just
+ * finished, but only if more than one rmap covered it.  The bag and the
+ * rtgroup cursors are torn down on every exit path after they are set up.
+ */
+STATIC int
+xrep_rtrefc_find_refcounts(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct rcbag *rcstack;
+ struct xfs_perag *pag = NULL;
+ uint64_t old_stack_height;
+ xfs_rgblock_t sbno;
+ xfs_rgblock_t cbno;
+ xfs_rgblock_t nbno;
+ bool have;
+ int error;
+
+ /*
+ * Scan for old rtrefc btree blocks. Bailing out of the loop early
+ * means we must drop the perag reference ourselves.
+ */
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
+ error = xrep_rtrefc_scan_ag(rr, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+
+ /*
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If this exceeds
+ * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
+ */
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
+ if (error)
+ goto out_cur;
+
+ /* Start the rtrmapbt cursor to the left of all records. */
+ error = xfs_btree_goto_left_edge(sc->sr.rmap_cur);
+ if (error)
+ goto out_bag;
+
+ /* Process reverse mappings into refcount data. */
+ while (xfs_btree_has_more_records(sc->sr.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
+ /* Push all rmaps with pblk == sbno onto the stack */
+ error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (!have)
+ break;
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
+ if (error)
+ goto out_bag;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ old_stack_height = rcbag_count(rcstack);
+
+ /* While stack isn't empty... */
+ while (rcbag_count(rcstack) > 0) {
+ /* Pop all rmaps that end at nbno */
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
+ if (error)
+ goto out_bag;
+
+ /* Push array items that start at nbno */
+ error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (have) {
+ error = xrep_rtrefc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
+ if (error)
+ goto out_bag;
+ }
+
+ /*
+ * Emit refcount if necessary. Only ranges that were
+ * covered by more than one rmap get a record; cbno
+ * advances whenever the stack height changed.
+ */
+ ASSERT(nbno > cbno);
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
+ error = xrep_rtrefc_stash(rr,
+ XFS_REFC_DOMAIN_SHARED,
+ cbno, nbno - cbno,
+ old_stack_height);
+ if (error)
+ goto out_bag;
+ }
+ cbno = nbno;
+ }
+
+ /* Stack empty, go find the next rmap */
+ if (rcbag_count(rcstack) == 0)
+ break;
+ old_stack_height = rcbag_count(rcstack);
+ sbno = nbno;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
+ &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ }
+ }
+
+ ASSERT(rcbag_count(rcstack) == 0);
+out_bag:
+ rcbag_free(&rcstack);
+out_cur:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ return error;
+}
+
+/*
+ * Retrieve refcountbt data for bulk load.  Copies the next @nr_wanted
+ * stashed records into @block, starting at record slot @idx, and advances
+ * the array cursor as it goes.  Returns the number of records loaded, or a
+ * negative errno if loading a record from the array fails.
+ */
+STATIC int
+xrep_rtrefc_get_records(
+	struct xfs_btree_cur	*cur,
+	unsigned int		idx,
+	struct xfs_btree_block	*block,
+	unsigned int		nr_wanted,
+	void			*priv)
+{
+	struct xrep_rtrefc	*rr = priv;
+	unsigned int		nr;
+
+	for (nr = 0; nr < nr_wanted; nr++) {
+		union xfs_btree_rec	*dst;
+		int			ret;
+
+		/* Stage the record in the cursor, then format it on disk. */
+		ret = xfarray_load(rr->refcount_records, rr->array_cur++,
+				&cur->bc_rec.rc);
+		if (ret)
+			return ret;
+
+		dst = xfs_btree_rec_addr(cur, idx + nr, block);
+		cur->bc_ops->init_rec_from_cur(cur, dst);
+	}
+
+	return nr;
+}
+
+/*
+ * Feed one of the new btree blocks to the bulk loader.  Installed as the
+ * bulk loader's claim_block hook; the request is handed straight to
+ * xrep_newbt_claim_block() using this repair's new_btree state.
+ */
+STATIC int
+xrep_rtrefc_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rtrefc *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/*
+ * Figure out how much space we need to create the incore btree root block.
+ * Installed as the bulk loader's iroot_size hook; @priv is not used.
+ */
+STATIC size_t
+xrep_rtrefc_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level,
+ nr_this_level);
+}
+
+/*
+ * Use the collected refcount information to stage a new rt refcount btree. If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ *
+ * The records are sorted and checked first.  If anything fails once the
+ * staging cursor exists, the cursor is deleted and the new btree state is
+ * cancelled before the error is returned.
+ */
+STATIC int
+xrep_rtrefc_build_new_tree(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_btree_cur *refc_cur;
+ int error;
+
+ error = xrep_rtrefc_sort_records(rr);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the realtime refcount inode.
+ */
+ error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
+ if (error)
+ return error;
+
+ rr->new_btree.bload.get_records = xrep_rtrefc_get_records;
+ rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block;
+ rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size;
+
+ refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg);
+ xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake);
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload,
+ xfarray_length(rr->refcount_records));
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire
+ * rtrefcountbt from the number of extents we found, and pump up our
+ * transaction to have sufficient block reservation. We're allowed
+ * to exceed quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg),
+ rr->new_btree.bload.nr_blocks, 0, true);
+ if (error)
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Add all observed refcount records. The array cursor is reset first
+ * so that get_records() starts from the beginning of the array.
+ */
+ rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE;
+ rr->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Install the new rtrefc btree in the inode. After this point the old
+ * btree is no longer accessible, the new tree is live, and we can
+ * delete the cursor.
+ */
+ xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp);
+ xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
+ xfs_btree_del_cursor(refc_cur, 0);
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+err_cur:
+ xfs_btree_del_cursor(refc_cur, error);
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them, then reset the metadata file's reservation.
+ */
+STATIC int
+xrep_rtrefc_remove_old_tree(
+	struct xrep_rtrefc	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	int			ret;
+
+	/*
+	 * Free all the extents that were allocated to the former rtrefcountbt
+	 * and aren't cross-linked with something else.
+	 */
+	ret = xrep_reap_metadir_fsblocks(sc, &rr->old_rtrefcountbt_blocks);
+	if (ret)
+		return ret;
+
+	/*
+	 * Ensure the proper reservation for the rtrefcount inode so that we
+	 * don't fail to expand the btree.
+	 */
+	return xrep_reset_metafile_resv(sc);
+}
+
+/*
+ * Rebuild the rt refcount btree.  Collects refcount records from the rmap
+ * data, stages and installs a new btree, and then reaps the old one.  The
+ * repair state is torn down on every exit path.
+ */
+int
+xrep_rtrefcountbt(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xrep_rtrefc	*rr;
+	char			*label;
+	int			ret;
+
+	/* We require the rmapbt to rebuild anything. */
+	if (!xfs_has_rtrmapbt(mp))
+		return -EOPNOTSUPP;
+
+	/* Make sure any problems with the fork are fixed. */
+	ret = xrep_metadata_inode_forks(sc);
+	if (ret)
+		return ret;
+
+	rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS);
+	if (!rr)
+		return -ENOMEM;
+	rr->sc = sc;
+
+	/* Set up enough storage to handle one refcount record per rt extent. */
+	label = xchk_xfile_ag_descr(sc, "reference count records");
+	ret = xfarray_create(label, mp->m_sb.sb_rextents,
+			sizeof(struct xfs_refcount_irec),
+			&rr->refcount_records);
+	kfree(label);
+	if (ret)
+		goto out_rr;
+
+	/* Collect all reference counts. */
+	xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks);
+	ret = xrep_rtrefc_find_refcounts(rr);
+	if (ret)
+		goto out_bitmap;
+
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+	/* Rebuild the refcount information. */
+	ret = xrep_rtrefc_build_new_tree(rr);
+	if (ret)
+		goto out_bitmap;
+
+	/* Kill the old tree; cleanup follows whether or not this succeeds. */
+	ret = xrep_rtrefc_remove_old_tree(rr);
+
+out_bitmap:
+	xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks);
+	xfarray_destroy(rr->refcount_records);
+out_rr:
+	kfree(rr);
+	return ret;
+}
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
index 300a1e85b3d6..12989fe80e8b 100644
--- a/fs/xfs/scrub/rtrmap.c
+++ b/fs/xfs/scrub/rtrmap.c
@@ -22,6 +22,7 @@
#include "xfs_rtalloc.h"
#include "xfs_rtgroup.h"
#include "xfs_metafile.h"
+#include "xfs_refcount.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -77,6 +78,18 @@ struct xchk_rtrmap {
struct xfs_rmap_irec prev_rec;
};
+/*
+ * Decide whether an rt rmap record is allowed to overlap another one: the
+ * filesystem must have rt reflink enabled and the mapping must be written.
+ */
+static inline bool
+xchk_rtrmapbt_is_shareable(
+	struct xfs_scrub		*sc,
+	const struct xfs_rmap_irec	*irec)
+{
+	return xfs_has_rtreflink(sc->mp) &&
+	       !(irec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
/* Flag failures for records that overlap but cannot. */
STATIC void
xchk_rtrmapbt_check_overlapping(
@@ -98,7 +111,10 @@ xchk_rtrmapbt_check_overlapping(
if (pnext <= irec->rm_startblock)
goto set_prev;
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ /* Overlap is only allowed if both records are data fork mappings. */
+ if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) ||
+ !xchk_rtrmapbt_is_shareable(bs->sc, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
/* Save whichever rmap record extends furthest. */
inext = irec->rm_startblock + irec->rm_blockcount;
@@ -149,6 +165,37 @@ xchk_rtrmapbt_check_mergeable(
memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
}
+/*
+ * Cross-reference a rmap against the refcount btree.  If any part of the
+ * mapping is recorded as shared, the rmap must belong to an inode and must
+ * not be an attr fork, bmbt block, or unwritten mapping.
+ */
+STATIC void
+xchk_rtrmapbt_xref_rtrefc(
+	struct xfs_scrub	*sc,
+	struct xfs_rmap_irec	*irec)
+{
+	xfs_rgblock_t		shared_bno;
+	xfs_extlen_t		shared_len;
+	bool			may_share;
+	int			error;
+
+	if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+		return;
+
+	/* Only written data fork mappings of inodes can be shared. */
+	may_share = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner) &&
+		    !(irec->rm_flags & (XFS_RMAP_BMBT_BLOCK |
+					XFS_RMAP_ATTR_FORK |
+					XFS_RMAP_UNWRITTEN));
+
+	/* If this is shared, must be a data fork extent. */
+	error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock,
+			irec->rm_blockcount, &shared_bno, &shared_len, false);
+	if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+		return;
+	if (shared_len != 0 && !may_share)
+		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
/* Cross-reference with other metadata. */
STATIC void
xchk_rtrmapbt_xref(
@@ -161,6 +208,11 @@ xchk_rtrmapbt_xref(
xchk_xref_is_used_rt_space(sc,
xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock),
irec->rm_blockcount);
+ /* rm_startblock is an rtgroup block here, so use the rt CoW staging check. */
+ if (irec->rm_owner == XFS_RMAP_OWN_COW)
+ xchk_xref_is_rt_cow_staging(sc, irec->rm_startblock,
+ irec->rm_blockcount);
+ else
+ xchk_rtrmapbt_xref_rtrefc(sc, irec);
}
/* Scrub a realtime rmapbt record. */
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index 49de8bc2dd17..f2fdd7a9fc24 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -30,6 +30,7 @@
#include "xfs_rtalloc.h"
#include "xfs_ag.h"
#include "xfs_rtgroup.h"
+#include "xfs_refcount.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -38,6 +39,7 @@
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/fsb_bitmap.h"
+#include "scrub/rgb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/iscan.h"
@@ -423,6 +425,100 @@ xrep_rtrmap_scan_ag(
return error;
}
+/* Context handed to the bitmap walk that turns runs of set bits into rmaps. */
+struct xrep_rtrmap_stash_run {
+	/* rt rmap repair state that receives the stashed records */
+	struct xrep_rtrmap	*rr;
+
+	/* owner recorded in every rmap emitted by the walk */
+	uint64_t		owner;
+};
+
+/*
+ * Bitmap walk callback: stash one rmap covering the run of @len blocks that
+ * starts at @start, owned by the owner named in the walk context.
+ */
+static int
+xrep_rtrmap_stash_run(
+	uint32_t			start,
+	uint32_t			len,
+	void				*priv)
+{
+	struct xrep_rtrmap_stash_run	*ctx = priv;
+
+	return xrep_rtrmap_stash(ctx->rr, (xfs_rgblock_t)start, len,
+			ctx->owner, 0, 0);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ *
+ * @oinfo supplies the owner recorded in each emitted rmap.  Returns the
+ * result of the bitmap walk.
+ */
+STATIC int
+xrep_rtrmap_stash_bitmap(
+ struct xrep_rtrmap *rr,
+ struct xrgb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xrep_rtrmap_stash_run rsr = {
+ .rr = rr,
+ .owner = oinfo->oi_owner,
+ };
+
+ return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr);
+}
+
+/*
+ * Record a CoW staging extent.  Refcount query callback: a record that
+ * fails the domain check or is not in the CoW domain is reported as
+ * corruption; otherwise its range is set in the bitmap passed via @priv.
+ */
+STATIC int
+xrep_rtrmap_walk_cowblocks(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_refcount_irec	*irec,
+	void				*priv)
+{
+	struct xrgb_bitmap		*cow_blocks = priv;
+	bool				is_cow;
+
+	is_cow = xfs_refcount_check_domain(irec) &&
+		 irec->rc_domain == XFS_REFC_DOMAIN_COW;
+	if (!is_cow)
+		return -EFSCORRUPTED;
+
+	return xrgb_bitmap_set(cow_blocks, irec->rc_startblock,
+			irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for all CoW staging extents recorded in the rt refcount
+ * btree.  Does nothing on filesystems without rt reflink.  The CoW-domain
+ * records are first gathered into a bitmap, and an rmap owned by
+ * XFS_RMAP_OINFO_COW is then stashed for each run in that bitmap.
+ */
+STATIC int
+xrep_rtrmap_find_refcount_rmaps(
+	struct xrep_rtrmap	*rr)
+{
+	struct xrgb_bitmap	cow_blocks;	/* COWBIT */
+	struct xfs_refcount_irec low = {
+		.rc_startblock	= 0,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_refcount_irec high = {
+		.rc_startblock	= -1U,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	if (!xfs_has_rtreflink(sc->mp))
+		return 0;
+
+	xrgb_bitmap_init(&cow_blocks);
+
+	/* Collect the CoW staging extents, then emit an rmap for each run. */
+	error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high,
+			xrep_rtrmap_walk_cowblocks, &cow_blocks);
+	if (!error)
+		error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks,
+				&XFS_RMAP_OINFO_COW);
+
+	xrgb_bitmap_destroy(&cow_blocks);
+	return error;
+}
+
/* Count and check all collected records. */
STATIC int
xrep_rtrmap_check_record(
@@ -460,6 +556,13 @@ xrep_rtrmap_find_rmaps(
return error;
}
+ /* Find CoW staging extents. */
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+ error = xrep_rtrmap_find_refcount_rmaps(rr);
+ xchk_rtgroup_btcur_free(&sc->sr);
+ if (error)
+ return error;
+
/*
* Set up for a potentially lengthy filesystem scan by reducing our
* transaction resource usage for the duration. Specifically:
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 16da054b2eb0..7567dd5cad14 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -467,6 +467,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.has = xfs_has_rtrmapbt,
.repair = xrep_rtrmapbt,
},
+ [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */
+ .type = ST_RTGROUP,
+ .setup = xchk_setup_rtrefcountbt,
+ .scrub = xchk_rtrefcountbt,
+ .has = xfs_has_rtreflink,
+ .repair = xrep_rtrefcountbt,
+ },
};
static int
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index cba4e89a3a62..a1086f1f06d0 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -129,6 +129,7 @@ struct xchk_rt {
/* rtgroup btrees */
struct xfs_btree_cur *rmap_cur;
+ struct xfs_btree_cur *refc_cur;
};
struct xfs_scrub {
@@ -284,11 +285,13 @@ int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
int xchk_rgsuperblock(struct xfs_scrub *sc);
int xchk_rtrmapbt(struct xfs_scrub *sc);
+int xchk_rtrefcountbt(struct xfs_scrub *sc);
#else
# define xchk_rtbitmap xchk_nothing
# define xchk_rtsummary xchk_nothing
# define xchk_rgsuperblock xchk_nothing
# define xchk_rtrmapbt xchk_nothing
+# define xchk_rtrefcountbt xchk_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
@@ -328,11 +331,20 @@ void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
xfs_extlen_t len);
void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
xfs_extlen_t len, const struct xfs_owner_info *oinfo);
+void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
#else
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
# define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0)
# define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0)
# define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0)
+# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0)
#endif
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index eb6bb170c902..f8a37ea97791 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -83,6 +83,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_METAPATH] = "metapath",
[XFS_SCRUB_TYPE_RGSUPER] = "rgsuper",
[XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt",
+ [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index fb86b746bc17..d7c4ced47c15 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -77,6 +77,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -111,7 +112,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
{ XFS_SCRUB_TYPE_BARRIER, "barrier" }, \
{ XFS_SCRUB_TYPE_METAPATH, "metapath" }, \
{ XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \
- { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }
+ { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \
+ { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -1962,32 +1964,36 @@ DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup);
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
DECLARE_EVENT_CLASS(xrep_extent_class,
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len),
- TP_ARGS(pag, agbno, len),
+ TP_ARGS(xg, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = pag_mount(pag)->m_super->s_dev;
- __entry->agno = pag_agno(pag);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len)
);
#define DEFINE_REPAIR_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_extent_class, name, \
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
xfs_extlen_t len), \
- TP_ARGS(pag, agbno, len))
+ TP_ARGS(xg, agbno, len))
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
@@ -1995,35 +2001,39 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len, bool crosslinked),
- TP_ARGS(pag, agbno, len, crosslinked),
+ TP_ARGS(xg, agbno, len, crosslinked),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
__field(bool, crosslinked)
),
TP_fast_assign(
- __entry->dev = pag_mount(pag)->m_super->s_dev;
- __entry->agno = pag_agno(pag);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
__entry->crosslinked = crosslinked;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x crosslinked %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len,
__entry->crosslinked ? 1 : 0)
);
#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \
DEFINE_EVENT(xrep_reap_find_class, name, \
- TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
xfs_extlen_t len, bool crosslinked), \
- TP_ARGS(pag, agbno, len, crosslinked))
+ TP_ARGS(xg, agbno, len, crosslinked))
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
@@ -2114,29 +2124,33 @@ TRACE_EVENT(xrep_ibt_found,
)
TRACE_EVENT(xrep_refc_found,
- TP_PROTO(const struct xfs_perag *pag,
+ TP_PROTO(const struct xfs_group *xg,
const struct xfs_refcount_irec *rec),
- TP_ARGS(pag, rec),
+ TP_ARGS(xg, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
+ __field(enum xfs_group_type, type)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
),
TP_fast_assign(
- __entry->dev = pag_mount(pag)->m_super->s_dev;
- __entry->agno = pag_agno(pag);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->agno = xg->xg_gno;
+ __entry->type = xg->xg_type;
__entry->domain = rec->rc_domain;
__entry->startblock = rec->rc_startblock;
__entry->blockcount = rec->rc_blockcount;
__entry->refcount = rec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 4f2e4ea29e1f..b05d5b81f642 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -271,6 +271,9 @@ xlog_recover_validate_buf_type(
case XFS_REFC_CRC_MAGIC:
bp->b_ops = &xfs_refcountbt_buf_ops;
break;
+ case XFS_RTREFC_CRC_MAGIC:
+ bp->b_ops = &xfs_rtrefcountbt_buf_ops;
+ break;
default:
warnmsg = "Bad btree block magic!";
break;
@@ -859,6 +862,7 @@ xlog_recover_get_buf_lsn(
break;
}
case XFS_RTRMAP_CRC_MAGIC:
+ case XFS_RTREFC_CRC_MAGIC:
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC: {
struct xfs_btree_block *btb = blk;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 3e3ef16f65a3..1dbd2d75f7ae 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -27,6 +27,7 @@
#include "xfs_ag.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/* Convert an xfs_fsmap to an fsmap. */
static void
@@ -212,21 +213,20 @@ xfs_getfsmap_is_shared(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_btree_cur *cur;
xfs_agblock_t fbno;
- xfs_extlen_t flen;
+ xfs_extlen_t flen = 0;
int error;
*stat = false;
- if (!xfs_has_reflink(mp))
- return 0;
- /* rt files will have no perag structure */
- if (!info->group)
+ if (!xfs_has_reflink(mp) || !info->group)
return 0;
- /* Are there any shared blocks here? */
- flen = 0;
- cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
- to_perag(info->group));
+ if (info->group->xg_type == XG_TYPE_RTG)
+ cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(info->group));
+ else
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
+ to_perag(info->group));
+ /* Are there any shared blocks here? */
error = xfs_refcount_find_shared(cur, frec->rec_key,
XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen,
false);
@@ -863,7 +863,7 @@ xfs_getfsmap_rtdev_rmapbt_query(
struct xfs_rtgroup *rtg = to_rtg(info->group);
/* Query the rtrmapbt */
- xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
*curpp = xfs_rtrmapbt_init_cursor(tp, rtg);
return xfs_rmap_query_range(*curpp, &info->low, &info->high,
xfs_getfsmap_rtdev_rmapbt_helper, info);
@@ -950,7 +950,8 @@ xfs_getfsmap_rtdev_rmapbt(
if (bt_cur) {
xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group),
- XFS_RTGLOCK_RMAP);
+ XFS_RTGLOCK_RMAP |
+ XFS_RTGLOCK_REFCOUNT);
xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
bt_cur = NULL;
}
@@ -988,7 +989,7 @@ xfs_getfsmap_rtdev_rmapbt(
if (bt_cur) {
xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group),
- XFS_RTGLOCK_RMAP);
+ XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
XFS_BTREE_NOERROR);
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9df5a09c0acd..455298503d01 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -23,6 +23,7 @@
#include "xfs_trace.h"
#include "xfs_rtalloc.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -231,6 +232,7 @@ xfs_growfs_data_private(
/* Compute new maxlevels for rt btrees. */
xfs_rtrmapbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
}
return error;
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index d438c3c001c8..7c541fb373d5 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -448,6 +448,7 @@ static const struct ioctl_sick_map rtgroup_map[] = {
{ XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP },
{ XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY },
{ XFS_SICK_RG_RMAPBT, XFS_RTGROUP_GEOM_SICK_RMAPBT },
+ { XFS_SICK_RG_REFCNTBT, XFS_RTGROUP_GEOM_SICK_REFCNTBT },
};
/* Fill out rtgroup geometry health info. */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a174f64b8bb2..70283c6419fd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -157,6 +157,20 @@ xfs_inode_item_precommit(
if (flags & XFS_ILOG_IVERSION)
flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+ /*
+ * Inode verifiers do not check that the CoW extent size hint is an
+ * integer multiple of the rt extent size on a directory with both
+ * rtinherit and cowextsize flags set. If we're logging a directory
+ * that is misconfigured in this way, clear the hint.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 5de1d3563b76..f3bfb814378c 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -23,6 +23,7 @@
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
STATIC void
xlog_recover_inode_ra_pass2(
@@ -286,6 +287,9 @@ xlog_recover_inode_dbroot(
case XFS_METAFILE_RTRMAP:
xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize);
return 0;
+ case XFS_METAFILE_RTREFCOUNT:
+ xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize);
+ return 0;
default:
ASSERT(0);
return -EFSCORRUPTED;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0789c18aaa18..726282e74d54 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -469,8 +469,21 @@ xfs_fill_fsxattr(
}
}
- if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
- fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ /*
+ * Don't let a misaligned CoW extent size hint on a directory
+ * escape to userspace if it won't pass the setattr checks
+ * later.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) {
+ fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+ fa->fsx_cowextsize = 0;
+ } else {
+ fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+ }
+ }
+
fa->fsx_projid = ip->i_projid;
if (ifp && !xfs_need_iread_extents(ifp))
fa->fsx_nextents = xfs_iext_count(ifp);
@@ -541,10 +554,6 @@ xfs_ioctl_setattr_xflags(
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
xfs_extlen_to_rtxmod(mp, ip->i_extsize))
return -EINVAL;
-
- /* Clear reflink if we are actually able to set the rt flag. */
- if (xfs_is_reflink_inode(ip))
- ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
}
/* diflags2 only valid for v3 inodes. */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5c95c97519c7..b3c27dbccce8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1822,6 +1822,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_rtefd_item_ops,
&xlog_rtrui_item_ops,
&xlog_rtrud_item_ops,
+ &xlog_rtcui_item_ops,
+ &xlog_rtcud_item_ops,
};
static const struct xlog_recover_item_ops *
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7b7d21b50d54..477c5262cf91 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -38,6 +38,7 @@
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -656,7 +657,8 @@ static inline void
xfs_rtbtree_compute_maxlevels(
struct xfs_mount *mp)
{
- mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels;
+ mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels,
+ mp->m_rtrefc_maxlevels);
}
/*
@@ -729,6 +731,7 @@ xfs_mountfs(
xfs_rmapbt_compute_maxlevels(mp);
xfs_rtrmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
xfs_agbtree_compute_maxlevels(mp);
xfs_rtbtree_compute_maxlevels(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1bc95fb170db..9a1516080e63 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -162,11 +162,14 @@ typedef struct xfs_mount {
uint m_rtrmap_mnr[2]; /* min rtrmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
+ uint m_rtrefc_mxr[2]; /* max rtrefc btree records */
+ uint m_rtrefc_mnr[2]; /* min rtrefc btree records */
uint m_alloc_maxlevels; /* max alloc btree levels */
uint m_bm_maxlevels[2]; /* max bmap btree levels */
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_rtrmap_maxlevels; /* max rtrmap btree level */
uint m_refc_maxlevels; /* max refcount btree level */
+ uint m_rtrefc_maxlevels; /* max rtrefc btree level */
unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
@@ -408,6 +411,12 @@ static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp)
xfs_has_rmapbt(mp);
}
+static inline bool xfs_has_rtreflink(struct xfs_mount *mp)
+{
+ return xfs_has_metadir(mp) && xfs_has_realtime(mp) &&
+ xfs_has_reflink(mp);
+}
+
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index bede1c96c330..fe2d7aab8554 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -23,6 +23,7 @@
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
+#include "xfs_rtgroup.h"
struct kmem_cache *xfs_cui_cache;
struct kmem_cache *xfs_cud_cache;
@@ -94,8 +95,9 @@ xfs_cui_item_format(
ASSERT(atomic_read(&cuip->cui_next_extent) ==
cuip->cui_format.cui_nextents);
+ ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT);
- cuip->cui_format.cui_type = XFS_LI_CUI;
+ cuip->cui_format.cui_type = lip->li_type;
cuip->cui_format.cui_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format,
@@ -138,12 +140,14 @@ xfs_cui_item_release(
STATIC struct xfs_cui_log_item *
xfs_cui_init(
struct xfs_mount *mp,
+ unsigned short item_type,
uint nextents)
-
{
struct xfs_cui_log_item *cuip;
ASSERT(nextents > 0);
+ ASSERT(item_type == XFS_LI_CUI || item_type == XFS_LI_CUI_RT);
+
if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
cuip = kzalloc(xfs_cui_log_item_sizeof(nextents),
GFP_KERNEL | __GFP_NOFAIL);
@@ -151,7 +155,7 @@ xfs_cui_init(
cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+ xfs_log_item_init(mp, &cuip->cui_item, item_type, &xfs_cui_item_ops);
cuip->cui_format.cui_nextents = nextents;
cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
atomic_set(&cuip->cui_next_extent, 0);
@@ -190,7 +194,9 @@ xfs_cud_item_format(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
struct xfs_log_iovec *vecp = NULL;
- cudp->cud_format.cud_type = XFS_LI_CUD;
+ ASSERT(lip->li_type == XFS_LI_CUD || lip->li_type == XFS_LI_CUD_RT);
+
+ cudp->cud_format.cud_type = lip->li_type;
cudp->cud_format.cud_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format,
@@ -234,6 +240,14 @@ static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e)
return list_entry(e, struct xfs_refcount_intent, ri_list);
}
+static inline bool
+xfs_cui_item_isrt(const struct xfs_log_item *lip)
+{
+ ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT);
+
+ return lip->li_type == XFS_LI_CUI_RT;
+}
+
/* Sort refcount intents by AG. */
static int
xfs_refcount_update_diff_items(
@@ -282,18 +296,20 @@ xfs_refcount_update_log_item(
}
static struct xfs_log_item *
-xfs_refcount_update_create_intent(
+__xfs_refcount_update_create_intent(
struct xfs_trans *tp,
struct list_head *items,
unsigned int count,
- bool sort)
+ bool sort,
+ unsigned short item_type)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
+ struct xfs_cui_log_item *cuip;
struct xfs_refcount_intent *ri;
ASSERT(count > 0);
+ cuip = xfs_cui_init(mp, item_type, count);
if (sort)
list_sort(mp, items, xfs_refcount_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -301,6 +317,23 @@ xfs_refcount_update_create_intent(
return &cuip->cui_item;
}
+static struct xfs_log_item *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_refcount_update_create_intent(tp, items, count, sort,
+ XFS_LI_CUI);
+}
+
+static inline unsigned short
+xfs_cud_type_from_cui(const struct xfs_cui_log_item *cuip)
+{
+ return xfs_cui_item_isrt(&cuip->cui_item) ? XFS_LI_CUD_RT : XFS_LI_CUD;
+}
+
/* Get an CUD so we can process all the deferred refcount updates. */
static struct xfs_log_item *
xfs_refcount_update_create_done(
@@ -312,8 +345,8 @@ xfs_refcount_update_create_done(
struct xfs_cud_log_item *cudp;
cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
- &xfs_cud_item_ops);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item,
+ xfs_cud_type_from_cui(cuip), &xfs_cud_item_ops);
cudp->cud_cuip = cuip;
cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
@@ -328,10 +361,20 @@ xfs_refcount_defer_add(
{
struct xfs_mount *mp = tp->t_mountp;
- trace_xfs_refcount_defer(mp, ri);
+ /*
+ * Deferred refcount updates for the realtime and data sections must
+ * use separate transactions to finish deferred work because updates to
+ * realtime metadata files can lock AGFs to allocate btree blocks and
+ * we don't want that mixing with the AGF locks taken to finish data
+ * section updates.
+ */
+ ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock,
+ ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG);
- ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG);
- xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
+ trace_xfs_refcount_defer(mp, ri);
+ xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ?
+ &xfs_rtrefcount_update_defer_type :
+ &xfs_refcount_update_defer_type);
}
/* Cancel a deferred refcount update. */
@@ -381,7 +424,7 @@ xfs_refcount_finish_one_cleanup(
return;
agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
- if (error)
+ if (error && agbp)
xfs_trans_brelse(tp, agbp);
}
@@ -397,6 +440,7 @@ xfs_refcount_update_abort_intent(
static inline bool
xfs_cui_validate_phys(
struct xfs_mount *mp,
+ bool isrt,
struct xfs_phys_extent *pmap)
{
if (!xfs_has_reflink(mp))
@@ -415,6 +459,9 @@ xfs_cui_validate_phys(
return false;
}
+ if (isrt)
+ return xfs_verify_rtbext(mp, pmap->pe_startblock, pmap->pe_len);
+
return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
}
@@ -422,6 +469,7 @@ static inline void
xfs_cui_recover_work(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp,
+ bool isrt,
struct xfs_phys_extent *pmap)
{
struct xfs_refcount_intent *ri;
@@ -432,7 +480,8 @@ xfs_cui_recover_work(
ri->ri_startblock = pmap->pe_startblock;
ri->ri_blockcount = pmap->pe_len;
ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock,
- XG_TYPE_AG);
+ isrt ? XG_TYPE_RTG : XG_TYPE_AG);
+ ri->ri_realtime = isrt;
xfs_defer_add_item(dfp, &ri->ri_list);
}
@@ -451,6 +500,7 @@ xfs_refcount_recover_work(
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_mount *mp = lip->li_log->l_mp;
+ bool isrt = xfs_cui_item_isrt(lip);
int i;
int error = 0;
@@ -460,7 +510,7 @@ xfs_refcount_recover_work(
* just toss the CUI.
*/
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
- if (!xfs_cui_validate_phys(mp,
+ if (!xfs_cui_validate_phys(mp, isrt,
&cuip->cui_format.cui_extents[i])) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&cuip->cui_format,
@@ -468,7 +518,8 @@ xfs_refcount_recover_work(
return -EFSCORRUPTED;
}
- xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]);
+ xfs_cui_recover_work(mp, dfp, isrt,
+ &cuip->cui_format.cui_extents[i]);
}
/*
@@ -515,10 +566,13 @@ xfs_refcount_relog_intent(
struct xfs_phys_extent *pmap;
unsigned int count;
+ ASSERT(intent->li_type == XFS_LI_CUI ||
+ intent->li_type == XFS_LI_CUI_RT);
+
count = CUI_ITEM(intent)->cui_format.cui_nextents;
pmap = CUI_ITEM(intent)->cui_format.cui_extents;
- cuip = xfs_cui_init(tp->t_mountp, count);
+ cuip = xfs_cui_init(tp->t_mountp, intent->li_type, count);
memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap));
atomic_set(&cuip->cui_next_extent, count);
@@ -538,6 +592,71 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.relog_intent = xfs_refcount_relog_intent,
};
+#ifdef CONFIG_XFS_RT
+static struct xfs_log_item *
+xfs_rtrefcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_refcount_update_create_intent(tp, items, count, sort,
+ XFS_LI_CUI_RT);
+}
+
+/* Process a deferred realtime refcount update. */
+STATIC int
+xfs_rtrefcount_update_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_refcount_intent *ri = ci_entry(item);
+ int error;
+
+ error = xfs_rtrefcount_finish_one(tp, ri, state);
+
+ /* Did we run out of reservation? Requeue what we didn't finish. */
+ if (!error && ri->ri_blockcount > 0) {
+ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
+ ri->ri_type == XFS_REFCOUNT_DECREASE);
+ return -EAGAIN;
+ }
+
+ xfs_refcount_update_cancel_item(item);
+ return error;
+}
+
+/* Clean up after calling xfs_rtrefcount_finish_one. */
+STATIC void
+xfs_rtrefcount_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ if (rcur)
+ xfs_btree_del_cursor(rcur, error);
+}
+
+const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = {
+ .name = "rtrefcount",
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rtrefcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_rtrefcount_update_finish_item,
+ .finish_cleanup = xfs_rtrefcount_finish_one_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+ .recover_work = xfs_refcount_recover_work,
+ .relog_intent = xfs_refcount_relog_intent,
+};
+#else
+const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = {
+ .name = "rtrefcount",
+};
+#endif /* CONFIG_XFS_RT */
+
STATIC bool
xfs_cui_item_match(
struct xfs_log_item *lip,
@@ -603,7 +722,7 @@ xlog_recover_cui_commit_pass2(
return -EFSCORRUPTED;
}
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents);
xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
@@ -617,6 +736,61 @@ const struct xlog_recover_item_ops xlog_cui_item_ops = {
.commit_pass2 = xlog_recover_cui_commit_pass2,
};
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtcui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_cui_log_format *cui_formatp;
+ size_t len;
+
+ cui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents);
+ xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
+ atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+
+ xlog_recover_intent_item(log, &cuip->cui_item, lsn,
+ &xfs_rtrefcount_update_defer_type);
+ return 0;
+}
+#else
+STATIC int
+xlog_recover_rtcui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+}
+#endif
+
+const struct xlog_recover_item_ops xlog_rtcui_item_ops = {
+ .item_type = XFS_LI_CUI_RT,
+ .commit_pass2 = xlog_recover_rtcui_commit_pass2,
+};
+
/*
* This routine is called when an CUD format structure is found in a committed
* transaction in the log. Its purpose is to cancel the corresponding CUI if it
@@ -648,3 +822,33 @@ const struct xlog_recover_item_ops xlog_cud_item_ops = {
.item_type = XFS_LI_CUD,
.commit_pass2 = xlog_recover_cud_commit_pass2,
};
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtcud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_cud_log_format *cud_formatp;
+
+ cud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_CUI_RT,
+ cud_formatp->cud_cui_id);
+ return 0;
+}
+#else
+# define xlog_recover_rtcud_commit_pass2 xlog_recover_rtcui_commit_pass2
+#endif
+
+const struct xlog_recover_item_ops xlog_rtcud_item_ops = {
+ .item_type = XFS_LI_CUD_RT,
+ .commit_pass2 = xlog_recover_rtcud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b11769c009ef..59f7fc16eb80 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -30,6 +30,10 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_health.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_metafile.h"
/*
* Copy on Write of Shared Blocks
@@ -120,38 +124,93 @@
*/
/*
- * Given an AG extent, find the lowest-numbered run of shared blocks
- * within that range and return the range in fbno/flen. If
- * find_end_of_shared is true, return the longest contiguous extent of
- * shared blocks. If there are no shared extents, fbno and flen will
- * be set to NULLAGBLOCK and 0, respectively.
+ * Given a file mapping for the data device, find the lowest-numbered run of
+ * shared blocks within that mapping and return it in shared_offset/shared_len.
+ * The offset is relative to the start of irec.
+ *
+ * If find_end_of_shared is true, return the longest contiguous extent of shared
+ * blocks. If there are no shared extents, shared_offset and shared_len will be
+ * set to 0.
*/
static int
xfs_reflink_find_shared(
- struct xfs_perag *pag,
+ struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- xfs_agblock_t *fbno,
- xfs_extlen_t *flen,
+ const struct xfs_bmbt_irec *irec,
+ xfs_extlen_t *shared_offset,
+ xfs_extlen_t *shared_len,
bool find_end_of_shared)
{
struct xfs_buf *agbp;
+ struct xfs_perag *pag;
struct xfs_btree_cur *cur;
int error;
+ xfs_agblock_t orig_bno, found_bno;
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
- return error;
+ goto out;
- cur = xfs_refcountbt_init_cursor(pag_mount(pag), tp, agbp, pag);
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount,
+ &found_bno, shared_len, find_end_of_shared);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agbp);
- error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
- find_end_of_shared);
+ if (!error && *shared_len)
+ *shared_offset = found_bno - orig_bno;
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+/*
+ * Given a file mapping for the rt device, find the lowest-numbered run of
+ * shared blocks within that mapping and return it in shared_offset/shared_len.
+ * The offset is relative to the start of irec.
+ *
+ * If find_end_of_shared is true, return the longest contiguous extent of shared
+ * blocks. If there are no shared extents, shared_offset and shared_len will be
+ * set to 0.
+ */
+static int
+xfs_reflink_find_rtshared(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_bmbt_irec *irec,
+ xfs_extlen_t *shared_offset,
+ xfs_extlen_t *shared_len,
+ bool find_end_of_shared)
+{
+ struct xfs_rtgroup *rtg;
+ struct xfs_btree_cur *cur;
+ xfs_rgblock_t orig_bno;
+ xfs_agblock_t found_bno;
+ int error;
+
+ BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK);
+
+ /*
+ * Note: this uses the not quite correct xfs_agblock_t type because
+ * xfs_refcount_find_shared is shared between the RT and data device
+ * refcount code.
+ */
+ orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock);
+ rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, irec->br_startblock));
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT);
+ cur = xfs_rtrefcountbt_init_cursor(tp, rtg);
+ error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount,
+ &found_bno, shared_len, find_end_of_shared);
xfs_btree_del_cursor(cur, error);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
+ xfs_rtgroup_put(rtg);
- xfs_trans_brelse(tp, agbp);
+ if (!error && *shared_len)
+ *shared_offset = found_bno - orig_bno;
return error;
}
@@ -172,11 +231,7 @@ xfs_reflink_trim_around_shared(
bool *shared)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
+ xfs_extlen_t shared_offset, shared_len;
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
@@ -187,41 +242,37 @@ xfs_reflink_trim_around_shared(
trace_xfs_reflink_trim_around_shared(ip, irec);
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
- agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
- aglen = irec->br_blockcount;
-
- error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
- true);
- xfs_perag_put(pag);
+ if (XFS_IS_REALTIME_INODE(ip))
+ error = xfs_reflink_find_rtshared(mp, NULL, irec,
+ &shared_offset, &shared_len, true);
+ else
+ error = xfs_reflink_find_shared(mp, NULL, irec,
+ &shared_offset, &shared_len, true);
if (error)
return error;
- *shared = false;
- if (fbno == NULLAGBLOCK) {
+ if (!shared_len) {
/* No shared blocks at all. */
- return 0;
- }
-
- if (fbno == agbno) {
+ *shared = false;
+ } else if (!shared_offset) {
/*
- * The start of this extent is shared. Truncate the
- * mapping at the end of the shared region so that a
- * subsequent iteration starts at the start of the
- * unshared region.
+ * The start of this mapping points to shared space. Truncate
+ * the mapping at the end of the shared region so that a
+ * subsequent iteration starts at the start of the unshared
+ * region.
*/
- irec->br_blockcount = flen;
+ irec->br_blockcount = shared_len;
*shared = true;
- return 0;
+ } else {
+ /*
+ * There's a shared region that doesn't start at the beginning
+ * of the mapping. Truncate the mapping at the start of the
+ * shared extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = shared_offset;
+ *shared = false;
}
-
- /*
- * There's a shared extent midway through this extent.
- * Truncate the mapping at the start of the shared
- * extent so that a subsequent iteration starts at the
- * start of the shared region.
- */
- irec->br_blockcount = fbno - agbno;
return 0;
}
@@ -389,20 +440,26 @@ xfs_reflink_fill_cow_hole(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks;
+ unsigned int dblocks = 0, rblocks = 0;
int nimaps;
int error;
bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
+ } else {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
+ }
xfs_iunlock(ip, *lockmode);
*lockmode = 0;
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
- false, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, false, &tp);
if (error)
return error;
@@ -571,6 +628,7 @@ xfs_reflink_cancel_cow_blocks(
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
int error = 0;
if (!xfs_inode_has_cow_data(ip))
@@ -598,12 +656,13 @@ xfs_reflink_cancel_cow_blocks(
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
/* Free the CoW orphan record. */
- xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
- del.br_blockcount);
+ xfs_refcount_free_cow_extent(*tpp, isrt,
+ del.br_startblock, del.br_blockcount);
error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL,
- XFS_AG_RESV_NONE, 0);
+ XFS_AG_RESV_NONE,
+ isrt ? XFS_FREE_EXTENT_REALTIME : 0);
if (error)
break;
@@ -687,6 +746,35 @@ out:
return error;
}
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * Adjust the quota block accounting of @ip by @blocks for a remap operation.
+ *
+ * Mappings that move from the CoW fork into the data fork (@is_cow) adjust
+ * the delayed allocation accounting; data fork to data fork remaps use the
+ * regular block accounting.  Realtime inodes pick the RT flavour of
+ * whichever flag applies.  @blocks is passed through unchanged, so callers
+ * supply a negative value to uncharge.
+ */
+static inline void
+xfs_reflink_update_quota(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	bool			is_cow,
+	int64_t			blocks)
+{
+	unsigned int		field;
+
+	if (is_cow)
+		field = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_DELRTBCOUNT :
+						    XFS_TRANS_DQ_DELBCOUNT;
+	else
+		field = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_RTBCOUNT :
+						    XFS_TRANS_DQ_BCOUNT;
+
+	xfs_trans_mod_dquot_byino(tp, ip, field, blocks);
+}
+#else
+# define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0)
+#endif
+
/*
* Remap part of the CoW fork into the data fork.
*
@@ -710,6 +798,7 @@ xfs_reflink_end_cow_extent(
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
int nmaps;
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
@@ -779,9 +868,8 @@ xfs_reflink_end_cow_extent(
* or not), unmap the extent and drop its refcount.
*/
xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
- xfs_refcount_decrease_extent(tp, &data);
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
- -data.br_blockcount);
+ xfs_refcount_decrease_extent(tp, isrt, &data);
+ xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount);
} else if (data.br_startblock == DELAYSTARTBLOCK) {
int done;
@@ -799,14 +887,14 @@ xfs_reflink_end_cow_extent(
}
/* Free the CoW orphan record. */
- xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
+ xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock,
+ del.br_blockcount);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
/* Charge this new data fork mapping to the on-disk quota. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
- (long)del.br_blockcount);
+ xfs_reflink_update_quota(tp, ip, true, del.br_blockcount);
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
@@ -895,20 +983,29 @@ xfs_reflink_recover_cow(
struct xfs_mount *mp)
{
struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
int error = 0;
if (!xfs_has_reflink(mp))
return 0;
while ((pag = xfs_perag_next(mp, pag))) {
- error = xfs_refcount_recover_cow_leftovers(mp, pag);
+ error = xfs_refcount_recover_cow_leftovers(pag_group(pag));
if (error) {
xfs_perag_rele(pag);
- break;
+ return error;
}
}
- return error;
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ error = xfs_refcount_recover_cow_leftovers(rtg_group(rtg));
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+
+ return 0;
}
/*
@@ -1100,14 +1197,28 @@ out_error:
static int
xfs_reflink_ag_has_free_space(
struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ struct xfs_inode *ip,
+ xfs_fsblock_t fsb)
{
struct xfs_perag *pag;
+ xfs_agnumber_t agno;
int error = 0;
if (!xfs_has_rmapbt(mp))
return 0;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ struct xfs_rtgroup *rtg;
+ xfs_rgnumber_t rgno;
+
+ rgno = xfs_rtb_to_rgno(mp, fsb);
+ rtg = xfs_rtgroup_get(mp, rgno);
+ if (xfs_metafile_resv_critical(rtg_rmap(rtg)))
+ error = -ENOSPC;
+ xfs_rtgroup_put(rtg);
+ return error;
+ }
+ agno = XFS_FSB_TO_AGNO(mp, fsb);
pag = xfs_perag_get(mp, agno);
if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
@@ -1131,10 +1242,11 @@ xfs_reflink_remap_extent(
struct xfs_trans *tp;
xfs_off_t newlen;
int64_t qdelta = 0;
- unsigned int resblks;
+ unsigned int dblocks, rblocks, resblks;
bool quota_reserved = true;
bool smap_real;
bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
int iext_delta = 0;
int nimaps;
int error;
@@ -1161,8 +1273,15 @@ xfs_reflink_remap_extent(
* we're remapping.
*/
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = resblks;
+ rblocks = dmap->br_blockcount;
+ } else {
+ dblocks = resblks + dmap->br_blockcount;
+ rblocks = 0;
+ }
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
- resblks + dmap->br_blockcount, 0, false, &tp);
+ dblocks, rblocks, false, &tp);
if (error == -EDQUOT || error == -ENOSPC) {
quota_reserved = false;
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
@@ -1213,8 +1332,8 @@ xfs_reflink_remap_extent(
/* No reflinking if the AG of the dest mapping is low on space. */
if (dmap_written) {
- error = xfs_reflink_ag_has_free_space(mp,
- XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
+ error = xfs_reflink_ag_has_free_space(mp, ip,
+ dmap->br_startblock);
if (error)
goto out_cancel;
}
@@ -1242,8 +1361,15 @@ xfs_reflink_remap_extent(
* done.
*/
if (!quota_reserved && !smap_real && dmap_written) {
- error = xfs_trans_reserve_quota_nblks(tp, ip,
- dmap->br_blockcount, 0, false);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = 0;
+ rblocks = dmap->br_blockcount;
+ } else {
+ dblocks = dmap->br_blockcount;
+ rblocks = 0;
+ }
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
+ false);
if (error)
goto out_cancel;
}
@@ -1264,7 +1390,7 @@ xfs_reflink_remap_extent(
* or not), unmap the extent and drop its refcount.
*/
xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
- xfs_refcount_decrease_extent(tp, &smap);
+ xfs_refcount_decrease_extent(tp, isrt, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
int done;
@@ -1287,12 +1413,12 @@ xfs_reflink_remap_extent(
* its refcount and map it into the file.
*/
if (dmap_written) {
- xfs_refcount_increase_extent(tp, dmap);
+ xfs_refcount_increase_extent(tp, isrt, dmap);
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
qdelta += dmap->br_blockcount;
}
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+ xfs_reflink_update_quota(tp, ip, false, qdelta);
/* Update dest isize if needed. */
newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
@@ -1466,8 +1592,8 @@ xfs_reflink_remap_prep(
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
- /* Don't reflink realtime inodes */
- if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+ /* Can't reflink between data and rt volumes */
+ if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
/* Don't share DAX file data with non-DAX file. */
@@ -1547,27 +1673,23 @@ xfs_reflink_inode_has_shared_extents(
*has_shared = false;
found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
- struct xfs_perag *pag;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
+ xfs_extlen_t shared_offset, shared_len;
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
goto next;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
- agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
- aglen = got.br_blockcount;
- error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
- &rbno, &rlen, false);
- xfs_perag_put(pag);
+ if (XFS_IS_REALTIME_INODE(ip))
+ error = xfs_reflink_find_rtshared(mp, tp, &got,
+ &shared_offset, &shared_len, false);
+ else
+ error = xfs_reflink_find_shared(mp, tp, &got,
+ &shared_offset, &shared_len, false);
if (error)
return error;
/* Is there still a shared block here? */
- if (rbno != NULLAGBLOCK) {
+ if (shared_len) {
*has_shared = true;
return 0;
}
@@ -1700,3 +1822,28 @@ out:
trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
return error;
}
+
+/*
+ * Report whether reflink can be used on a realtime volume whose rt extent
+ * size is @rextsize fsblocks.  There is deliberately no rblocks > 0 check
+ * here, because this can be called as part of attaching a new rt section.
+ */
+bool
+xfs_reflink_supports_rextsize(
+	struct xfs_mount	*mp,
+	unsigned int		rextsize)
+{
+	/*
+	 * Both conditions must hold:
+	 *
+	 *  - rtgroups are enabled, because reflink on the realtime device
+	 *    requires them;
+	 *  - the rt extent size is exactly one fsblock.  Anything larger would
+	 *    require CoW-around for unaligned write requests to guarantee
+	 *    that entire rt extents are always remapped.
+	 */
+	return xfs_has_rtgroups(mp) && rextsize == 1;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 4a58e4533671..cc4e92278279 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -25,7 +25,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip)
return true;
}
-extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
+int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool *shared);
@@ -62,4 +62,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
xfs_extlen_t cowextsize, unsigned int remap_flags);
+bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize);
+
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a69967f9d88e..d8e6d073d64d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -31,6 +31,8 @@
#include "xfs_rtgroup.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_reflink.h"
/*
* Return whether there are any free extents in the size range given
@@ -593,7 +595,7 @@ xfs_rtalloc_sumlevel(
* specified. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int
+static int
xfs_rtallocate_extent_size(
struct xfs_rtalloc_args *args,
xfs_rtxlen_t minlen, /* minimum length to allocate */
@@ -994,6 +996,7 @@ xfs_growfs_rt_bmblock(
*/
mp->m_features |= XFS_FEAT_REALTIME;
xfs_rtrmapbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
kfree(nmp);
return 0;
@@ -1177,6 +1180,7 @@ xfs_growfs_check_rtgeom(
nmp->m_sb.sb_dblocks = dblocks;
xfs_rtrmapbt_compute_maxlevels(nmp);
+ xfs_rtrefcountbt_compute_maxlevels(nmp);
xfs_trans_resv_calc(nmp, M_RES(nmp));
/*
@@ -1289,8 +1293,10 @@ xfs_growfs_rt(
goto out_unlock;
if (xfs_has_quota(mp))
goto out_unlock;
- }
- if (xfs_has_reflink(mp))
+ if (xfs_has_reflink(mp))
+ goto out_unlock;
+ } else if (xfs_has_reflink(mp) &&
+ !xfs_reflink_supports_rextsize(mp, in->extsize))
goto out_unlock;
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
@@ -1547,6 +1553,11 @@ xfs_rt_resv_init(
err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask);
if (err2 && !error)
error = err2;
+
+ ask = xfs_rtrefcountbt_calc_reserves(mp);
+ err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask);
+ if (err2 && !error)
+ error = err2;
}
return error;
@@ -1950,7 +1961,7 @@ out_unlock:
goto out_release;
}
-static int
+int
xfs_rtallocate_rtgs(
struct xfs_trans *tp,
xfs_fsblock_t bno_hint,
@@ -2015,7 +2026,10 @@ xfs_rtallocate_align(
if (*noalign) {
align = mp->m_sb.sb_rextsize;
} else {
- align = xfs_get_extsz_hint(ap->ip);
+ if (ap->flags & XFS_BMAPI_COWFORK)
+ align = xfs_get_cowextsz_hint(ap->ip);
+ else
+ align = xfs_get_extsz_hint(ap->ip);
if (!align)
align = 1;
if (align == mp->m_sb.sb_rextsize)
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 9044f7226ab6..0d95b29092c9 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -77,4 +77,9 @@ xfs_growfs_check_rtgeom(const struct xfs_mount *mp,
}
#endif /* CONFIG_XFS_RT */
+int xfs_rtallocate_rtgs(struct xfs_trans *tp, xfs_fsblock_t bno_hint,
+ xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, xfs_rtxlen_t prod,
+ bool wasdel, bool initial_user_data, xfs_rtblock_t *bno,
+ xfs_extlen_t *blen);
+
#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index b7f2988bc03b..35c7fb3ba324 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -54,7 +54,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "rmapbt_mem", xfsstats_offset(xs_rcbag_2) },
{ "rcbagbt", xfsstats_offset(xs_rtrmap_2) },
{ "rtrmapbt", xfsstats_offset(xs_rtrmap_mem_2)},
- { "rtrmapbt_mem", xfsstats_offset(xs_qm_dqreclaims)},
+ { "rtrmapbt_mem", xfsstats_offset(xs_rtrefcbt_2) },
+ { "rtrefcntbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
{ "qm", xfsstats_offset(xs_xstrat_bytes)},
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 9c47de5dff2d..15ba1abcf253 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -129,6 +129,7 @@ struct __xfsstats {
uint32_t xs_rcbag_2[__XBTS_MAX];
uint32_t xs_rtrmap_2[__XBTS_MAX];
uint32_t xs_rtrmap_mem_2[__XBTS_MAX];
+ uint32_t xs_rtrefcbt_2[__XBTS_MAX];
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ecd5a9f444d8..7c3f996cd39e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1754,9 +1754,11 @@ xfs_fs_fill_super(
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
if (xfs_has_reflink(mp)) {
- if (mp->m_sb.sb_rblocks) {
+ if (xfs_has_realtime(mp) &&
+ !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) {
xfs_alert(mp,
- "reflink not compatible with realtime device!");
+ "reflink not compatible with realtime extent size %u!",
+ mp->m_sb.sb_rextsize);
error = -EINVAL;
goto out_filestream_unmount;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 84cdc145e2d9..4fe689410eb6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3305,56 +3305,62 @@ TRACE_EVENT(xfs_ag_resv_init_error,
/* refcount tracepoint classes */
DECLARE_EVENT_CLASS(xfs_refcount_class,
- TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno,
xfs_extlen_t len),
- TP_ARGS(cur, agbno, len),
+ TP_ARGS(cur, gbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__entry->len)
);
#define DEFINE_REFCOUNT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_class, name, \
- TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, \
xfs_extlen_t len), \
- TP_ARGS(cur, agbno, len))
+ TP_ARGS(cur, gbno, len))
TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
TRACE_EVENT(xfs_refcount_lookup,
- TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno,
xfs_lookup_t dir),
- TP_ARGS(cur, agbno, dir),
+ TP_ARGS(cur, gbno, dir),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_lookup_t, dir)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
__entry->dir = dir;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x cmp %s(%d)",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR),
__entry->dir)
)
@@ -3365,6 +3371,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_ARGS(cur, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
@@ -3373,14 +3380,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
@@ -3396,49 +3405,53 @@ DEFINE_EVENT(xfs_refcount_extent_class, name, \
/* single-rcext and an agbno tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec,
- xfs_agblock_t agbno),
- TP_ARGS(cur, irec, agbno),
+ xfs_agblock_t gbno),
+ TP_ARGS(cur, irec, gbno),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount,
- __entry->agbno)
+ __entry->gbno)
)
#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \
- xfs_agblock_t agbno), \
- TP_ARGS(cur, irec, agbno))
+ xfs_agblock_t gbno), \
+ TP_ARGS(cur, irec, gbno))
/* double-rcext tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
- struct xfs_refcount_irec *i2),
+ struct xfs_refcount_irec *i2),
TP_ARGS(cur, i1, i2),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
@@ -3451,6 +3464,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
@@ -3461,9 +3475,10 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
@@ -3484,10 +3499,11 @@ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
/* double-rcext and an agbno tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
- struct xfs_refcount_irec *i2, xfs_agblock_t agbno),
- TP_ARGS(cur, i1, i2, agbno),
+ struct xfs_refcount_irec *i2, xfs_agblock_t gbno),
+ TP_ARGS(cur, i1, i2, gbno),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
@@ -3497,10 +3513,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
@@ -3510,11 +3527,12 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
- __entry->agbno = agbno;
+ __entry->gbno = gbno;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
@@ -3524,14 +3542,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
- __entry->agbno)
+ __entry->gbno)
)
#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
- struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \
- TP_ARGS(cur, i1, i2, agbno))
+ struct xfs_refcount_irec *i2, xfs_agblock_t gbno), \
+ TP_ARGS(cur, i1, i2, gbno))
/* triple-rcext tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
@@ -3540,6 +3558,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_ARGS(cur, i1, i2, i3),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
@@ -3556,6 +3575,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->type = cur->bc_group->xg_type;
__entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
@@ -3570,10 +3590,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
__entry->i3_blockcount = i3->rc_blockcount;
__entry->i3_refcount = i3->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
- "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s gbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
@@ -3641,23 +3662,27 @@ DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
TP_ARGS(mp, refc),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(int, op)
- __field(xfs_agblock_t, agbno)
+ __field(xfs_agblock_t, gbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock);
+ __entry->type = refc->ri_group->xg_type;
+ __entry->agno = refc->ri_group->xg_gno;
__entry->op = refc->ri_type;
- __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock);
+ __entry->gbno = xfs_fsb_to_gbno(mp, refc->ri_startblock,
+ refc->ri_group->xg_type);
__entry->len = refc->ri_blockcount;
),
- TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
+ __entry->gbno,
__entry->len)
);
#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \