diff options
74 files changed, 4328 insertions, 350 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 338e10f81b7b..7afa51e41427 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -51,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_rmap_btree.o \ xfs_refcount.o \ xfs_refcount_btree.o \ + xfs_rtrefcount_btree.o \ xfs_rtrmap_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ @@ -194,6 +195,7 @@ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rgsuper.o \ rtbitmap.o \ + rtrefcount.o \ rtrmap.o \ rtsummary.o \ ) @@ -234,6 +236,7 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ + rtrefcount_repair.o \ rtrmap_repair.o \ rtsummary_repair.o \ ) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 02323936cc9b..40ad22fb808b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4564,8 +4564,9 @@ xfs_bmapi_write( * the refcount btree for orphan recovery. */ if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); + xfs_refcount_alloc_cow_extent(tp, + XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); } /* Deal with the allocated space we found. */ @@ -4740,7 +4741,8 @@ xfs_bmapi_convert_one_delalloc( *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); + xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -5388,7 +5390,7 @@ xfs_bmap_del_extent_real( bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - xfs_refcount_decrease_extent(tp, del); + xfs_refcount_decrease_extent(tp, isrt, del); } else if (isrt && !xfs_has_rtgroups(mp)) { error = xfs_bmap_free_rtblocks(tp, del); } else { @@ -6498,9 +6500,8 @@ xfs_get_extsz_hint( * No point in aligning allocations if we need to COW to actually * write to them. 
*/ - if (xfs_is_always_cow_inode(ip)) - return 0; - if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) + if (!xfs_is_always_cow_inode(ip) && + (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) return ip->i_extsize; if (XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1) @@ -6523,7 +6524,13 @@ xfs_get_cowextsz_hint( a = 0; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) a = ip->i_cowextsize; - b = xfs_get_extsz_hint(ip); + if (XFS_IS_REALTIME_INODE(ip)) { + b = 0; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + b = ip->i_extsize; + } else { + b = xfs_get_extsz_hint(ip); + } a = max(a, b); if (a == 0) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 36ab06f8a3bc..299ce7fd11b0 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -35,6 +35,7 @@ #include "xfs_rmap.h" #include "xfs_quota.h" #include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" /* * Btree magic numbers. @@ -5535,6 +5536,9 @@ xfs_btree_init_cur_caches(void) error = xfs_rtrmapbt_init_cur_cache(); if (error) goto err; + error = xfs_rtrefcountbt_init_cur_cache(); + if (error) + goto err; return 0; err: @@ -5552,6 +5556,7 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); xfs_rtrmapbt_destroy_cur_cache(); + xfs_rtrefcountbt_destroy_cur_cache(); } /* Move the btree cursor before the first record. */ diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index dbc047b2fb2c..355b304696e6 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -297,7 +297,7 @@ struct xfs_btree_cur struct { unsigned int nr_ops; /* # record updates */ unsigned int shape_changes; /* # of extent splits */ - } bc_refc; /* refcountbt */ + } bc_refc; /* refcountbt/rtrefcountbt */ }; /* Must be at the end of the struct! 
*/ diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 1e2477eaa5a8..9effd95ddcd4 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -68,6 +68,7 @@ struct xfs_defer_op_type { extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index fba4e59aded4..b1007fb661ba 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -858,6 +858,7 @@ enum xfs_metafile_type { XFS_METAFILE_RTBITMAP, /* rt bitmap */ XFS_METAFILE_RTSUMMARY, /* rt summary */ XFS_METAFILE_RTRMAP, /* rt rmap */ + XFS_METAFILE_RTREFCOUNT, /* rt refcount */ XFS_METAFILE_MAX } __packed; @@ -870,7 +871,8 @@ enum xfs_metafile_type { { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ - { XFS_METAFILE_RTRMAP, "rtrmap" } + { XFS_METAFILE_RTRMAP, "rtrmap" }, \ + { XFS_METAFILE_RTREFCOUNT, "rtrefcount" } /* * On-disk inode structure. @@ -1790,12 +1792,29 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -#define MAXREFCOUNT ((xfs_nlink_t)~0U) -#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) +#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U) +#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U) /* btree pointer type */ typedef __be32 xfs_refcount_ptr_t; +/* + * Realtime Reference Count btree format definitions + * + * This is a btree for reference count records for realtime volumes + */ +#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ + +/* + * rt refcount root header, on-disk form only. 
+ */ +struct xfs_rtrefcount_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + +/* inode-rooted btree pointer type */ +typedef __be64 xfs_rtrefcount_ptr_t; /* * BMAP Btree format definitions diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index d42d3a5617e3..2c3171262b44 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -738,9 +738,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ #define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ #define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */ +#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 32 +#define XFS_SCRUB_TYPE_NR 33 /* * This special type code only applies to the vectored scrub implementation. @@ -831,9 +832,10 @@ struct xfs_scrub_vec_head { #define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ #define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ #define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */ +#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */ /* Number of metapath sm_ino values */ -#define XFS_SCRUB_METAPATH_NR (9) +#define XFS_SCRUB_METAPATH_NR (10) /* * ioctl limits @@ -996,6 +998,7 @@ struct xfs_rtgroup_geometry { #define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ #define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ +#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ /* * ioctl commands that are used by Linux filesystems diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 5c8a0aff6ba6..b31000f7190c 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -71,6 +71,7 @@ struct xfs_rtgroup; #define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ #define 
XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ #define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */ +#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */ /* Observable health issues for AG metadata. */ #define XFS_SICK_AG_SB (1 << 0) /* superblock */ @@ -117,7 +118,8 @@ struct xfs_rtgroup; #define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ XFS_SICK_RG_BITMAP | \ XFS_SICK_RG_SUMMARY | \ - XFS_SICK_RG_RMAPBT) + XFS_SICK_RG_RMAPBT | \ + XFS_SICK_RG_REFCNTBT) #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ XFS_SICK_AG_AGF | \ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 17cb91b89fca..f24fa628fecf 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -456,6 +456,11 @@ xfs_dinode_verify_fork( if (!xfs_has_rmapbt(mp)) return __this_address; break; + case XFS_METAFILE_RTREFCOUNT: + /* same comment about growfs and rmap inodes applies */ + if (!xfs_has_reflink(mp)) + return __this_address; + break; default: return __this_address; } @@ -743,7 +748,8 @@ xfs_dinode_verify( return __this_address; /* don't let reflink and realtime mix */ - if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && + !xfs_has_rtreflink(mp)) return __this_address; /* COW extent size hint validation */ @@ -904,11 +910,29 @@ xfs_inode_validate_cowextsize( bool rt_flag; bool hint_flag; uint32_t cowextsize_bytes; + uint32_t blocksize_bytes; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); + /* + * Similar to extent size hints, a directory can be configured to + * propagate realtime status and a CoW extent size hint to newly + * created files even if there is no realtime device, and the hints on + * disk can become misaligned if the sysadmin changes the rt extent + * size while adding the realtime device. 
+ * + * Therefore, we can only enforce the rextsize alignment check against + * regular realtime files, and rely on callers to decide when alignment + * checks are appropriate, and fix things up as needed. + */ + + if (rt_flag) + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + else + blocksize_bytes = mp->m_sb.sb_blocksize; + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -922,16 +946,13 @@ xfs_inode_validate_cowextsize( if (mode && !hint_flag && cowextsize != 0) return __this_address; - if (hint_flag && rt_flag) - return __this_address; - - if (cowextsize_bytes % mp->m_sb.sb_blocksize) + if (cowextsize_bytes % blocksize_bytes) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; - if (cowextsize > mp->m_sb.sb_agblocks / 2) + if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) return __this_address; return NULL; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index d9b3c182cb40..4f99b90add55 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -28,6 +28,7 @@ #include "xfs_health.h" #include "xfs_symlink_remote.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_ifork_cache; @@ -272,6 +273,8 @@ xfs_iformat_data_fork( switch (ip->i_metatype) { case XFS_METAFILE_RTRMAP: return xfs_iformat_rtrmap(ip, dip); + case XFS_METAFILE_RTREFCOUNT: + return xfs_iformat_rtrefcount(ip, dip); default: break; } @@ -620,6 +623,9 @@ xfs_iflush_fork( case XFS_METAFILE_RTRMAP: xfs_iflush_rtrmap(ip, dip); break; + case XFS_METAFILE_RTREFCOUNT: + xfs_iflush_rtrefcount(ip, dip); + break; default: ASSERT(0); break; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a7e0e479454d..ec7157eaba5f 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -252,6 +252,8 @@ typedef struct xfs_trans_header { #define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ #define 
XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */ #define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */ +#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */ +#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -275,7 +277,9 @@ typedef struct xfs_trans_header { { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \ { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \ - { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" } + { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \ + { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \ + { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" } /* * Inode Log Item Format definitions. diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index abc705aff26d..66c7916fb5cd 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -81,6 +81,8 @@ extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; extern const struct xlog_recover_item_ops xlog_rtrui_item_ops; extern const struct xlog_recover_item_ops xlog_rtrud_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcud_item_ops; /* * Macros, structures, prototypes for internal log manager use. 
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 07e2f5fb3a94..a85ecddaa48e 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -85,6 +85,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); + XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4); /* * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bbb86dc9a25c..cebe83f7842a 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -25,6 +25,9 @@ #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_refcount_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtalloc.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -128,7 +131,7 @@ xfs_refcount_check_irec( struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) return __this_address; if (!xfs_refcount_check_domain(irec)) @@ -138,12 +141,43 @@ xfs_refcount_check_irec( if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) return __this_address; - if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) return __this_address; return NULL; } +xfs_failaddr_t +xfs_rtrefcount_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) + return __this_address; + + if (!xfs_refcount_check_domain(irec)) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_rgbext(rtg, irec->rc_startblock, 
irec->rc_blockcount)) + return __this_address; + + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) + return __this_address; + + return NULL; +} + +static inline xfs_failaddr_t +xfs_refcount_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec) +{ + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec); + return xfs_refcount_check_irec(to_perag(cur->bc_group), irec); +} + static inline int xfs_refcount_complain_bad_rec( struct xfs_btree_cur *cur, @@ -152,9 +186,15 @@ xfs_refcount_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, + if (xfs_btree_is_rtrefcount(cur->bc_ops)) { + xfs_warn(mp, + "RT Refcount BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); + } else { + xfs_warn(mp, "Refcount BTree record corruption in AG %d detected at %pS!", cur->bc_group->xg_gno, fa); + } xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -180,7 +220,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(to_perag(cur->bc_group), irec); + fa = xfs_refcount_check_btrec(cur, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -853,9 +893,9 @@ xfs_refc_merge_refcount( const struct xfs_refcount_irec *irec, enum xfs_refc_adjust_op adjust) { - /* Once a record hits MAXREFCOUNT, it is pinned there forever */ - if (irec->rc_refcount == MAXREFCOUNT) - return MAXREFCOUNT; + /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */ + if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX) + return XFS_REFC_REFCOUNT_MAX; return irec->rc_refcount + adjust; } @@ -898,7 +938,7 @@ xfs_refc_want_merge_center( * hence we need to catch u32 addition overflows here. 
*/ ulen += cleft->rc_blockcount + right->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; *ulenp = ulen; @@ -933,7 +973,7 @@ xfs_refc_want_merge_left( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -967,7 +1007,7 @@ xfs_refc_want_merge_right( * hence we need to catch u32 addition overflows here. */ ulen += cright->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -1065,7 +1105,7 @@ xfs_refcount_still_have_space( */ overhead = xfs_allocfree_block_count(cur->bc_mp, cur->bc_refc.shape_changes); - overhead += cur->bc_mp->m_refc_maxlevels; + overhead += cur->bc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -1085,6 +1125,22 @@ xfs_refcount_still_have_space( cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } +/* Schedule an extent free. */ +static int +xrefc_free_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *rec) +{ + unsigned int flags = 0; + + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + flags |= XFS_FREE_EXTENT_REALTIME; + + return xfs_free_extent_later(cur->bc_tp, + xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock), + rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags); +} + /* * Adjust the refcounts of middle extents. At this point we should have * split extents that crossed the adjustment range; merged with adjacent @@ -1101,7 +1157,6 @@ xfs_refcount_adjust_extents( struct xfs_refcount_irec ext, tmp; int error; int found_rec, found_tmp; - xfs_fsblock_t fsbno; /* Merging did all the work already. 
*/ if (*aglen == 0) @@ -1117,7 +1172,7 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_SHARED; @@ -1154,11 +1209,7 @@ xfs_refcount_adjust_extents( goto out_error; } } else { - fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), - tmp.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + error = xrefc_free_extent(cur, &tmp); if (error) goto out_error; } @@ -1196,7 +1247,7 @@ xfs_refcount_adjust_extents( * Adjust the reference count and either update the tree * (incr) or free the blocks (decr). */ - if (ext.rc_refcount == MAXREFCOUNT) + if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX) goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur, &ext); @@ -1216,11 +1267,7 @@ xfs_refcount_adjust_extents( } goto advloop; } else { - fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), - ext.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + error = xrefc_free_extent(cur, &ext); if (error) goto out_error; } @@ -1417,12 +1464,122 @@ xfs_refcount_finish_one( } /* + * Set up a continuation of a deferred rtrefcount operation by updating the + * intent. Checks to make sure we're not going to run off the end of the + * rtgroup. 
+ */ +static inline int +xfs_rtrefcount_continue_op( + struct xfs_btree_cur *cur, + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno, + ri->ri_blockcount))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno); + + ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount)); + return 0; +} + +/* + * Process one of the deferred realtime refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + */ +int +xfs_rtrefcount_finish_one( + struct xfs_trans *tp, + struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + struct xfs_btree_cur *rcur = *pcur; + int error = 0; + xfs_rgblock_t bno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock); + + trace_xfs_refcount_deferred(mp, ri); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor rtgroup doesn't match + * the startblock, get one now. 
+ */ + if (rcur != NULL && rcur->bc_group != ri->ri_group) { + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; + xfs_btree_del_cursor(rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT); + *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg); + + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; + } + + switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + case XFS_REFCOUNT_FREE_COW: + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, ri); + return error; +} + +/* * Record a refcount intent for later processing. 
*/ static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, + bool isrt, xfs_fsblock_t startblock, xfs_extlen_t blockcount) { @@ -1434,6 +1591,7 @@ __xfs_refcount_add( ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + ri->ri_realtime = isrt; xfs_refcount_defer_add(tp, ri); } @@ -1444,12 +1602,13 @@ __xfs_refcount_add( void xfs_refcount_increase_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1459,12 +1618,13 @@ xfs_refcount_increase_extent( void xfs_refcount_decrease_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1666,7 +1826,7 @@ xfs_refcount_adjust_cow_extents( goto out_error; } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_COW; @@ -1820,6 +1980,7 @@ __xfs_refcount_cow_free( void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1828,16 +1989,17 @@ xfs_refcount_alloc_cow_extent( if (!xfs_has_reflink(mp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len); /* Add rmap entry */ - xfs_rmap_alloc_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW); + xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. 
*/ void xfs_refcount_free_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1847,8 +2009,8 @@ xfs_refcount_free_cow_extent( return; /* Remove rmap entry */ - xfs_rmap_free_extent(tp, false, fsb, len, XFS_RMAP_OWN_COW); - __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len); } struct xfs_refcount_recovery { @@ -1877,8 +2039,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(to_perag(cur->bc_group), &rr->rr_rrec) != - NULL || + if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { xfs_btree_mark_sick(cur); @@ -1893,12 +2054,13 @@ xfs_refcount_recover_extent( /* Find and remove leftover CoW reservations. */ int xfs_refcount_recover_cow_leftovers( - struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_group *xg) { + struct xfs_mount *mp = xg->xg_mount; + bool isrt = xg->xg_type == XG_TYPE_RTG; struct xfs_trans *tp; struct xfs_btree_cur *cur; - struct xfs_buf *agbp; + struct xfs_buf *agbp = NULL; struct xfs_refcount_recovery *rr, *n; struct list_head debris; union xfs_btree_irec low = { @@ -1911,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers( xfs_fsblock_t fsb; int error; - /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + /* reflink filesystems must not have groups larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG); BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); - if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) - return -EOPNOTSUPP; + + if (isrt) { + if (!xfs_has_rtgroups(mp)) + return 0; + if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS) + return -EOPNOTSUPP; + } else { + if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS) + return -EOPNOTSUPP; + } 
INIT_LIST_HEAD(&debris); @@ -1932,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); - if (error) - goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + if (isrt) { + xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg)); + } else { + error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg)); + } /* Find all the leftover CoW staging extents. */ error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agbp); + if (agbp) + xfs_trans_brelse(tp, agbp); + else + xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); xfs_trans_cancel(tp); if (error) goto out_free; @@ -1954,14 +2133,15 @@ xfs_refcount_recover_cow_leftovers( goto out_free; /* Free the orphan record */ - fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock); - xfs_refcount_free_cow_extent(tp, fsb, + fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock); + xfs_refcount_free_cow_extent(tp, isrt, fsb, rr->rr_rrec.rc_blockcount); /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + XFS_AG_RESV_NONE, + isrt ? 
XFS_FREE_EXTENT_REALTIME : 0); if (error) goto out_trans; @@ -2026,7 +2206,7 @@ xfs_refcount_query_range_helper( xfs_failaddr_t fa; xfs_refcount_btrec_to_irec(rec, &irec); - fa = xfs_refcount_check_irec(to_perag(cur->bc_group), &irec); + fa = xfs_refcount_check_btrec(cur, &irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 62d78afcf1f3..f2e299a716a4 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -12,6 +12,7 @@ struct xfs_perag; struct xfs_btree_cur; struct xfs_bmbt_irec; struct xfs_refcount_irec; +struct xfs_rtgroup; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); @@ -60,6 +61,7 @@ struct xfs_refcount_intent { enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; + bool ri_realtime; }; /* Check that the refcount is appropriate for the record domain. 
*/ @@ -74,24 +76,25 @@ xfs_refcount_check_domain( return true; } -void xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -void xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -extern int xfs_refcount_finish_one(struct xfs_trans *tp, +int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); +int xfs_rtrefcount_finish_one(struct xfs_trans *tp, struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - struct xfs_perag *pag); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg); /* * While we're adjusting the refcounts records of an extent, we have @@ -120,6 +123,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 
f8415fd96cc2..3cdf50563fec 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -285,6 +285,13 @@ xfs_rtrmap_check_meta_irec( if (irec->rm_blockcount != mp->m_sb.sb_rextsize) return __this_address; return NULL; + case XFS_RMAP_OWN_COW: + if (!xfs_has_rtreflink(mp)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; + return NULL; default: return __this_address; } diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index b7ed2d27d545..a6468e591232 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -34,6 +34,7 @@ #include "xfs_metafile.h" #include "xfs_metadir.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* Find the first usable fsblock in this rtgroup. */ static inline uint32_t @@ -206,6 +207,9 @@ xfs_rtgroup_lock( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL); } /* Unlock metadata inodes associated with this rt group. */ @@ -218,6 +222,9 @@ xfs_rtgroup_unlock( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL); + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); @@ -249,6 +256,9 @@ xfs_rtgroup_trans_join( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL); } /* Retrieve rt group geometry. 
*/ @@ -367,6 +377,15 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .enabled = xfs_has_rmapbt, .create = xfs_rtrmapbt_create, }, + [XFS_RTGI_REFCOUNT] = { + .name = "refcount", + .metafile_type = XFS_METAFILE_RTREFCOUNT, + .sick = XFS_SICK_RG_REFCNTBT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* same comment about growfs and rmap inodes applies here */ + .enabled = xfs_has_reflink, + .create = xfs_rtrefcountbt_create, + }, }; /* Return the shortname of this rtgroup inode. */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 6ff222a05367..03f39d4e43fc 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -15,6 +15,7 @@ enum xfs_rtg_inodes { XFS_RTGI_BITMAP, /* allocation bitmap */ XFS_RTGI_SUMMARY, /* allocation summary */ XFS_RTGI_RMAP, /* rmap btree inode */ + XFS_RTGI_REFCOUNT, /* refcount btree inode */ XFS_RTGI_MAX, }; @@ -80,6 +81,11 @@ static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg) return rtg->rtg_inodes[XFS_RTGI_RMAP]; } +static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_REFCOUNT]; +} + /* Passive rtgroup references */ static inline struct xfs_rtgroup * xfs_rtgroup_get( @@ -267,10 +273,13 @@ int xfs_update_last_rtgroup_size(struct xfs_mount *mp, #define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) /* Lock the rt rmap inode in exclusive mode */ #define XFS_RTGLOCK_RMAP (1U << 2) +/* Lock the rt refcount inode in exclusive mode */ +#define XFS_RTGLOCK_REFCOUNT (1U << 3) #define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_BITMAP_SHARED | \ - XFS_RTGLOCK_RMAP) + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c new file mode 100644 index 
000000000000..3db5e7a4a945 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_health.h" + +static struct kmem_cache *xfs_rtrefcountbt_cur_cache; + +/* + * Realtime Reference Count btree. + * + * This is a btree used to track the reference counts of extents in the + * realtime device. See the comments in xfs_refcount_btree.c for more + * information. + * + * This tree is basically the same as the regular refcount btree except that + * it's rooted in an inode.
+ */ + +static struct xfs_btree_cur * +xfs_rtrefcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +STATIC int +xfs_rtrefcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrefc_mnr[level != 0]; +} + +STATIC int +xfs_rtrefcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrefc_mxr[level != 0]; +} + +/* + * Calculate number of records in a realtime refcount btree inode root. + */ +unsigned int +xfs_rtrefcountbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrefcount_root); + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (2 * sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork so that + * we can resize the in-memory buffer to match it. After a resize to the + * maximum size this function returns the same value as + * xfs_rtrefcountbt_get_maxrecs for the root node, too. 
+ */ +STATIC int +xfs_rtrefcountbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrefc_mxr[level != 0]; + return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + +STATIC void +xfs_rtrefcountbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_rtrefcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_rtrefcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_rtrefcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC int64_t +xfs_rtrefcountbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; +} + +STATIC int64_t +xfs_rtrefcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - + 
be32_to_cpu(k2->refc.rc_startblock); +} + +static xfs_failaddr_t +xfs_rtrefcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_reflink(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrefc_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]); +} + +static void +xfs_rtrefcountbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrefcountbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrefcountbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrefcountbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { + .name = "xfs_rtrefcountbt", + .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) }, + .verify_read = xfs_rtrefcountbt_read_verify, + .verify_write = xfs_rtrefcountbt_write_verify, + .verify_struct = xfs_rtrefcountbt_verify, +}; + +STATIC int +xfs_rtrefcountbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_rtrefcountbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + 
be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} + +STATIC enum xbtree_key_contig +xfs_rtrefcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + +static inline void +xfs_rtrefcountbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrefcountbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. 
+ */ + /* xfs_rtrefcountbt_maxrecs() takes a leaf flag, not a level. */ + old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level == 0); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, broot, old_size, + new_size, old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrefcountbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. + */ + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size, + new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrefcount_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + +const struct xfs_btree_ops xfs_rtrefcountbt_ops = { + .name = "rtrefcount", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), + .sick_mask = XFS_SICK_RG_REFCNTBT, + + .dup_cursor = xfs_rtrefcountbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrefcountbt_get_minrecs, + .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs, + .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, + .key_diff = xfs_rtrefcountbt_key_diff, + .buf_ops = &xfs_rtrefcountbt_buf_ops, + .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, + .keys_inorder =
xfs_rtrefcountbt_keys_inorder, + .recs_inorder = xfs_rtrefcountbt_recs_inorder, + .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, + .broot_realloc = xfs_rtrefcountbt_broot_realloc, +}; + +/* Allocate a new rt refcount btree cursor. */ +struct xfs_btree_cur * +xfs_rtrefcountbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = rtg_refcount(rtg); + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, + mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + cur->bc_ino.whichfork = XFS_DATA_FORK; + return cur; +} + +/* + * Install a new rt refcount btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rtrefcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a realtime refcount btree block.
*/ +static inline unsigned int +xfs_rtrefcountbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Calculate the number of records that fit in a realtime refcount btree + * block of blocklen bytes, block header included. + */ +unsigned int +xfs_rtrefcountbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTREFCOUNT_BLOCK_LEN; + return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime refcount btrees. */ +unsigned int +xfs_rtrefcountbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2; + + /* We need at most one record for every block in an rt group. */ + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); +} + +int __init +xfs_rtrefcountbt_init_cur_cache(void) +{ + xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur", + xfs_btree_cur_sizeof( + xfs_rtrefcountbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrefcountbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrefcountbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrefcountbt_cur_cache); + xfs_rtrefcountbt_cur_cache = NULL; +} + +/* Compute the maximum height of a realtime refcount btree. */ +void +xfs_rtrefcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtreflink(mp)) { + mp->m_rtrefc_maxlevels = 0; + return; + } + + /* + * The realtime refcountbt lives on the data device, which means that + * its maximum height is constrained by the size of the data device and + * the height required to store one refcount record for each rtextent + * in an rt group.
+ */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr, + mp->m_sb.sb_dblocks); + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr, + mp->m_sb.sb_rgextents); + + /* Add one level to handle the inode root level. */ + mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} + +/* Calculate the rtrefcount btree size for some records. */ +unsigned long long +xfs_rtrefcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrefc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +static unsigned long long +xfs_rtrefcountbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrefc_mxr[0] == 0) + return 0; + + return xfs_rtrefcountbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + * We need enough space to hold one record for every rt extent in the rtgroup. + */ +xfs_filblks_t +xfs_rtrefcountbt_calc_reserves( + struct xfs_mount *mp) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); +} + +/* + * Convert on-disk form of btree root to in-memory form. 
+ */ +STATIC void +xfs_rtrefcountbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrefcount_root *dblock, + int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + unsigned int rblocklen; + + rblocklen = xfs_rtrefcount_broot_space(mp, dblock); + + xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + tkp = xfs_rtrefcount_key_addr(rblock, 1); + fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + numrecs = be16_to_cpu(dblock->bb_numrecs); + /* Node blocks store one key per pointer, so copy numrecs keys. */ + memcpy(tkp, fkp, sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + trp = xfs_rtrefcount_rec_addr(rblock, 1); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reference count btree root in from disk. */ +int +xfs_iformat_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrefcount inodes before adding a realtime + * volume to the filesystem, so we cannot use the rtrefcount predicate + * here.
+ */ + if (!xfs_has_reflink(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrefc_maxlevels || + xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* + * Convert in-memory form of btree root to on-disk form. + */ +void +xfs_rtrefcountbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + struct xfs_rtrefcount_root *dblock, + int dblocklen) +{ + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int maxrecs; + unsigned int numrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_key_addr(rblock, 1); + tkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + numrecs = be16_to_cpu(rblock->bb_numrecs); + /* Node blocks store one key per pointer, so copy numrecs keys. */ + memcpy(tkp, fkp, sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp =
xfs_rtrefcount_rec_addr(rblock, 1); + trp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reference count btree root out to disk. */ +void +xfs_iflush_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot, + ifp->if_broot_bytes, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} + +/* + * Create a realtime refcount btree inode. + */ +int +xfs_rtrefcountbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, + xfs_rtrefcount_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h new file mode 100644 index 000000000000..a99b7a8aec86 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_RTREFCOUNT_BTREE_H__ +#define __XFS_RTREFCOUNT_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; + +/* refcounts only exist on crc enabled filesystems */ +#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp, + unsigned int blocklen, bool leaf); +void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf); + +/* + * Addresses of records, keys, and pointers within an incore rtrefcountbt block. 
+ * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_refcount_rec * +xfs_rtrefcount_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void); +int __init xfs_rtrefcountbt_init_cur_cache(void); +void xfs_rtrefcountbt_destroy_cur_cache(void); + +xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp); +unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + +/* Addresses of key, pointers, and records within an ondisk rtrefcount block. 
*/ + +static inline struct xfs_refcount_rec * +xfs_rtrefcount_droot_rec_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_droot_key_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_droot_ptr_addr( + struct xfs_rtrefcount_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)(block + 1) + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrefcount_ptr_addr(bb, index, + xfs_rtrefcountbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrefcount_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTREFCOUNT_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. 
+ */ +static inline size_t +xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb) +{ + return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrefcount_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrefcount_root); + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_rtrefcount_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp, + struct xfs_btree_block *rblock, int rblocklen, + struct xfs_rtrefcount_root *dblock, int dblocklen); +void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); + +int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); + +#endif /* __XFS_RTREFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index b90901e39e92..e4ec36943cb7 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -718,6 +718,7 @@ xfs_rtrmapbt_maxrecs( unsigned int xfs_rtrmapbt_maxlevels_ondisk(void) { + unsigned long long max_dblocks; unsigned int minrecs[2]; unsigned int blocklen; @@ -726,8 +727,20 @@ xfs_rtrmapbt_maxlevels_ondisk(void) minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2; minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2; - /* We need at most one record for every block in an rt group. 
*/ - return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); + /* + * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs. + * + * On a reflink filesystem, each block in an rtgroup can have up to + * 2^32 (per the refcount record format) owners, which means that + * theoretically we could face up to 2^64 rmap records. However, we're + * likely to run out of blocks in the data device long before that + * happens, which means that we must compute the max height based on + * what the btree will look like if it consumes almost all the blocks + * in the data device due to maximal sharing factor. + */ + max_dblocks = -1U; /* max ag count */ + max_dblocks *= XFS_MAX_CRC_AG_BLOCKS; + return xfs_btree_space_to_height(minrecs, max_dblocks); } int __init @@ -766,9 +779,20 @@ xfs_rtrmapbt_compute_maxlevels( * maximum height is constrained by the size of the data device and * the height required to store one rmap record for each block in an * rt group. + * + * On a reflink filesystem, each rt block can have up to 2^32 (per the + * refcount record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. This makes the computation of + * maxlevels based on record count meaningless, so we only consider the + * size of the data device. */ d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr, mp->m_sb.sb_dblocks); + if (xfs_has_rtreflink(mp)) { + mp->m_rtrmap_maxlevels = d_maxlevels + 1; + return; + } + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr, mp->m_groups[XG_TYPE_RTG].blocks); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 83fb14b4074c..3dc5f5dba162 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -29,6 +29,7 @@ #include "xfs_exchrange.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -1226,6 +1227,13 @@ xfs_sb_mount_common( mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; + mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 960716c387cc..b1e0d9bc1f7d 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -42,6 +42,7 @@ extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops; extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; extern const struct xfs_buf_ops xfs_rtsb_buf_ops; +extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops; extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; @@ -58,6 +59,7 @@ extern const struct xfs_btree_ops xfs_rmapbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; extern const struct xfs_btree_ops xfs_rtrmapbt_ops; extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrefcountbt_ops; static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) { @@ -114,6 +116,11 @@ static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) return ops == &xfs_rtrmapbt_ops; } +static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrefcountbt_ops; +} + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 
f3392eb2d7f4..13d00c7166e1 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -92,6 +92,14 @@ xfs_refcountbt_block_count( return num_ops * (2 * mp->m_refc_maxlevels - 1); } +static unsigned int +xfs_rtrefcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_rtrefc_maxlevels - 1); +} + /* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into @@ -259,10 +267,13 @@ xfs_rtalloc_block_count( * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. * - * This is calculated as: + * This is calculated as the max of: * Data device refcount updates (t1): * the agfs of the ags containing the blocks: nr_ops * sector size * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size */ static unsigned int xfs_calc_refcountbt_reservation( @@ -270,12 +281,20 @@ xfs_calc_refcountbt_reservation( unsigned int nr_ops) { unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int t1, t2 = 0; if (!xfs_has_reflink(mp)) return 0; - return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + + if (xfs_has_realtime(mp)) + t2 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + blksz); + + return max(t1, t2); } /* diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index b45d2b32051a..cd6f0223879f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -647,7 +647,7 @@ xrep_agfl_fill( xfs_agblock_t agbno = start; 
int error; - trace_xrep_agfl_insert(sc->sa.pag, agbno, len); + trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len); while (agbno < start + len && af->fl_off < af->flcount) af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f6077b0cba8a..66da7d4d56ba 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -347,13 +347,31 @@ xchk_bmap_rt_iextent_xref( goto out_cur; rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock); - xchk_bmap_xref_rmap(info, irec, rgbno); - - xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, info->whichfork, - irec->br_startoff); - xchk_xref_is_only_rt_owned_by(info->sc, rgbno, - irec->br_blockcount, &oinfo); + switch (info->whichfork) { + case XFS_DATA_FORK: + xchk_bmap_xref_rmap(info, irec, rgbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + } + xchk_xref_is_not_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + break; + case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, rgbno); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &XFS_RMAP_OINFO_COW); + xchk_xref_is_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + break; + } out_cur: xchk_rtgroup_btcur_free(&info->sc->sr); out_free: diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index fd64bdf4e138..1084213b8e9b 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -101,14 +101,21 @@ xrep_bmap_discover_shared( xfs_filblks_t blockcount) { struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *cur; xfs_agblock_t agbno; xfs_agblock_t fbno; xfs_extlen_t flen; int error; - agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); - error = 
xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, - &fbno, &flen, false); + if (XFS_IS_REALTIME_INODE(sc->ip)) { + agbno = xfs_rtb_to_rgbno(sc->mp, startblock); + cur = sc->sr.refc_cur; + } else { + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + cur = sc->sa.refc_cur; + } + error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen, + false); if (error) return error; @@ -450,7 +457,9 @@ xrep_bmap_scan_rtgroup( return 0; error = xrep_rtgroup_init(sc, rtg, &sc->sr, - XFS_RTGLOCK_RMAP | XFS_RTGLOCK_BITMAP_SHARED); + XFS_RTGLOCK_RMAP | + XFS_RTGLOCK_REFCOUNT | + XFS_RTGLOCK_BITMAP_SHARED); if (error) return error; @@ -903,10 +912,6 @@ xrep_bmap_init_reflink_scan( if (whichfork != XFS_DATA_FORK) return RLS_IRRELEVANT; - /* cannot share realtime extents */ - if (XFS_IS_REALTIME_INODE(sc->ip)) - return RLS_IRRELEVANT; - return RLS_UNKNOWN; } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 06cb61e63498..28ad341df8ee 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -37,6 +37,7 @@ #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_bmap_util.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -797,6 +798,9 @@ xchk_rtgroup_lock( if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP)) sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); + return 0; } @@ -811,7 +815,10 @@ xchk_rtgroup_btcur_free( { if (sr->rmap_cur) xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR); + if (sr->refc_cur) + xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR); + sr->refc_cur = NULL; sr->rmap_cur = NULL; } @@ -1687,6 +1694,9 @@ xchk_meta_btree_count_blocks( case XFS_METAFILE_RTRMAP: cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); break; + case XFS_METAFILE_RTREFCOUNT: + cur = 
xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg); + break; default: ASSERT(0); return -EFSCORRUPTED; diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 50ac6cca18fe..bdcd40f0ec74 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -82,11 +82,13 @@ int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); int xchk_setup_rgsuperblock(struct xfs_scrub *sc); int xchk_setup_rtrmapbt(struct xfs_scrub *sc); +int xchk_setup_rtrefcountbt(struct xfs_scrub *sc); #else # define xchk_setup_rtbitmap xchk_setup_nothing # define xchk_setup_rtsummary xchk_setup_nothing # define xchk_setup_rgsuperblock xchk_setup_nothing # define xchk_setup_rtrmapbt xchk_setup_nothing +# define xchk_setup_rtrefcountbt xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); @@ -129,7 +131,8 @@ xchk_ag_init_existing( /* All the locks we need to check an rtgroup. */ #define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ - XFS_RTGLOCK_RMAP) + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno, struct xchk_rt *sr); diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 5b6194cef3e5..38a246b8bf11 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -26,6 +26,9 @@ #include "xfs_errortag.h" #include "xfs_icache.h" #include "xfs_refcount_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -34,6 +37,7 @@ #include "scrub/bitmap.h" #include "scrub/off_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -61,7 +65,10 @@ struct xrep_cow { struct xoff_bitmap bad_fileoffs; /* Bitmap of fsblocks that were removed from the CoW fork. 
*/ - struct xfsb_bitmap old_cowfork_fsblocks; + union { + struct xfsb_bitmap old_cowfork_fsblocks; + struct xrtb_bitmap old_cowfork_rtblocks; + }; /* CoW fork mappings used to scan for bad CoW staging extents. */ struct xfs_bmbt_irec irec; @@ -145,8 +152,7 @@ xrep_cow_mark_shared_staging( xrep_cow_trim_refcount(xc, &rrec, rec); return xrep_cow_mark_file_range(xc, - xfs_agbno_to_fsb(to_perag(cur->bc_group), - rrec.rc_startblock), + xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock), rrec.rc_blockcount); } @@ -177,9 +183,8 @@ xrep_cow_mark_missing_staging( if (xc->next_bno >= rrec.rc_startblock) goto next; - error = xrep_cow_mark_file_range(xc, - xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno), + xfs_gbno_to_fsb(cur->bc_group, xc->next_bno), rrec.rc_startblock - xc->next_bno); if (error) return error; @@ -222,8 +227,7 @@ xrep_cow_mark_missing_staging_rmap( } return xrep_cow_mark_file_range(xc, - xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno), - rec_len); + xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len); } /* @@ -311,6 +315,92 @@ out_pag: } /* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. + */ +STATIC int +xrep_cow_find_bad_rt( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_scrub *sc = xc->sc; + struct xfs_rtgroup *rtg; + int error = 0; + + xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock); + + rtg = xfs_rtgroup_get(sc->mp, + xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock)); + if (!rtg) + return -EFSCORRUPTED; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); + if (error) + goto out_rtg; + + /* Mark any CoW fork extents that are shared. 
*/ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sr; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sr; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + xfs_rgbno_to_rtb(rtg, xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sr; + } + + /* Mark any area that has an rmap that isn't a COW staging extent. */ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sr; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone + * turned on the debugging knob, replace everything in the + * CoW fork and then scan for staging extents in the refcountbt. 
+	 */
+	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+				xc->irec.br_blockcount);
+		if (error)
+			goto out_sr;
+	}
+
+out_sr:
+	xchk_rtgroup_btcur_free(&sc->sr);
+	xchk_rtgroup_free(sc, &sc->sr);
+out_rtg:
+	xfs_rtgroup_put(rtg);
+	return error;
+}
+
+/*
  * Allocate a replacement CoW staging extent of up to the given number of
  * blocks, and fill out the mapping.
  */
@@ -343,7 +433,7 @@ xrep_cow_alloc(
 	if (args.fsbno == NULLFSBLOCK)
 		return -ENOSPC;

-	xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+	xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len);

 	repl->fsbno = args.fsbno;
 	repl->len = args.len;
@@ -351,6 +441,32 @@ xrep_cow_alloc(
 }

 /*
+ * Allocate a replacement rt CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc_rt(
+	struct xfs_scrub	*sc,
+	xfs_extlen_t		maxlen,
+	struct xrep_cow_extent	*repl)
+{
+	xfs_rtxlen_t		maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen);
+	int			error;
+
+	error = xfs_trans_reserve_more(sc->tp, 0, maxrtx);
+	if (error)
+		return error;
+
+	error = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, maxrtx, 1, false,
+			false, &repl->fsbno, &repl->len);
+	if (error)
+		return error;
+
+	xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len);
+	return 0;
+}
+
+/*
  * Look up the current CoW fork mapping so that we only allocate enough to
  * replace a single mapping. 
If we don't find a mapping that covers the start * of the file range, or we find a delalloc or written extent, something is @@ -467,7 +583,10 @@ xrep_cow_replace_range( */ alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, nextoff - startoff); - error = xrep_cow_alloc(sc, alloc_len, &repl); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_alloc_rt(sc, alloc_len, &repl); + else + error = xrep_cow_alloc(sc, alloc_len, &repl); if (error) return error; @@ -483,8 +602,12 @@ xrep_cow_replace_range( return error; /* Note the old CoW staging extents; we'll reap them all later. */ - error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, - repl.len); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks, + got.br_startblock, repl.len); + else + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, + got.br_startblock, repl.len); if (error) return error; @@ -540,8 +663,16 @@ xrep_bmap_cow( if (!ifp) return 0; - /* realtime files aren't supported yet */ - if (XFS_IS_REALTIME_INODE(sc->ip)) + /* + * Realtime files with large extent sizes are not supported because + * we could encounter a CoW mapping that has been partially written + * out *and* requires replacement, and there's no solution to that. + */ + if (xfs_inode_has_bigrtalloc(sc->ip)) + return -EOPNOTSUPP; + + /* Metadata inodes aren't supposed to have data on the rt volume. 
*/ + if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip)) return -EOPNOTSUPP; /* @@ -562,7 +693,10 @@ xrep_bmap_cow( xc->sc = sc; xoff_bitmap_init(&xc->bad_fileoffs); - xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_init(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); for_each_xfs_iext(ifp, &icur, &xc->irec) { if (xchk_should_terminate(sc, &error)) @@ -585,7 +719,10 @@ xrep_bmap_cow( if (xfs_bmap_is_written_extent(&xc->irec)) continue; - error = xrep_cow_find_bad(xc); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_find_bad_rt(xc); + else + error = xrep_cow_find_bad(xc); if (error) goto out_bitmap; } @@ -600,13 +737,20 @@ xrep_bmap_cow( * by the refcount btree, not the inode, so it is correct to treat them * like inode metadata. */ - error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, - &XFS_RMAP_OINFO_COW); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks, + &XFS_RMAP_OINFO_COW); + else + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); if (error) goto out_bitmap; out_bitmap: - xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); xoff_bitmap_destroy(&xc->bad_fileoffs); kfree(xc); return error; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index bcc4244e3b55..3c0f25098b69 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -115,6 +115,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH }, [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER }, [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT }, + [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT }, }; /* Return the health status mask for this 
scrub type. */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 8e702121dc86..db6edd5a5fe5 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -260,12 +260,7 @@ xchk_inode_extsize( xchk_ino_set_warning(sc, ino); } -/* - * Validate di_cowextsize hint. - * - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). - * These functions must be kept in sync with each other. - */ +/* Validate di_cowextsize hint. */ STATIC void xchk_inode_cowextsize( struct xfs_scrub *sc, @@ -276,12 +271,25 @@ xchk_inode_cowextsize( uint64_t flags2) { xfs_failaddr_t fa; + uint32_t value = be32_to_cpu(dip->di_cowextsize); - fa = xfs_inode_validate_cowextsize(sc->mp, - be32_to_cpu(dip->di_cowextsize), mode, flags, - flags2); + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); + + /* + * XFS allows a sysadmin to change the rt extent size when adding a rt + * section to a filesystem after formatting. If there are any + * directories with cowextsize and rtinherit set, the hint could become + * misaligned with the new rextsize. The verifier doesn't check this, + * because we allow rtinherit directories even without an rt device. + * Flag this as an administrative warning since we will clean this up + * eventually. + */ + if ((flags & XFS_DIFLAG_RTINHERIT) && + (flags2 & XFS_DIFLAG2_COWEXTSIZE) && + value % sc->mp->m_sb.sb_rextsize > 0) + xchk_ino_set_warning(sc, ino); } /* Make sure the di_flags make sense for the inode. 
*/ @@ -360,8 +368,9 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) goto bad; - /* realtime and reflink make no sense, currently */ - if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) + /* realtime and reflink don't always go together */ + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) && + !xfs_has_rtreflink(mp)) goto bad; /* no bigtime iflag without the bigtime feature */ diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index d7e3f033b160..2f641b6d663e 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -40,6 +40,7 @@ #include "xfs_symlink_remote.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -564,8 +565,6 @@ xrep_dinode_flags( flags2 |= XFS_DIFLAG2_REFLINK; else flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); - if (flags & XFS_DIFLAG_REALTIME) - flags2 &= ~XFS_DIFLAG2_REFLINK; if (!xfs_has_bigtime(mp)) flags2 &= ~XFS_DIFLAG2_BIGTIME; if (!xfs_has_large_extent_counts(mp)) @@ -972,6 +971,34 @@ xrep_dinode_bad_rtrmapbt_fork( return false; } +/* Return true if this refcount-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_rtrefcountbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size) +{ + struct xfs_rtrefcount_root *dfp; + unsigned int nrecs; + unsigned int level; + + if (dfork_size < sizeof(struct xfs_rtrefcount_root)) + return true; + + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > sc->mp->m_rtrefc_maxlevels) + return true; + if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size) + return true; + if (level > 0 && nrecs == 0) + return true; + + return false; +} + /* Check a metadata-btree fork. 
*/
 STATIC bool
 xrep_dinode_bad_metabt_fork(
@@ -986,6 +1013,8 @@ xrep_dinode_bad_metabt_fork(
 	switch (be16_to_cpu(dip->di_metatype)) {
 	case XFS_METAFILE_RTRMAP:
 		return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size);
+	case XFS_METAFILE_RTREFCOUNT:
+		return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size);
 	default:
 		return true;
 	}
@@ -1251,6 +1280,7 @@ xrep_dinode_ensure_forkoff(
 {
 	struct xfs_bmdr_block	*bmdr;
 	struct xfs_rtrmap_root	*rmdr;
+	struct xfs_rtrefcount_root *rcdr;
 	struct xfs_scrub	*sc = ri->sc;
 	xfs_extnum_t		attr_extents, data_extents;
 	size_t			bmdr_minsz = xfs_bmdr_space_calc(1);
@@ -1363,6 +1393,10 @@ xrep_dinode_ensure_forkoff(
 			rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
 			dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr);
 			break;
+		case XFS_METAFILE_RTREFCOUNT:
+			rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+			dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr);
+			break;
 		default:
 			dfork_min = 0;
 			break;
@@ -1790,10 +1824,6 @@ xrep_inode_flags(
 	/* DAX only applies to files and dirs. */
 	if (!(S_ISREG(mode) || S_ISDIR(mode)))
 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
-
-	/* No reflink files on the realtime device. */
-	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
-		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
 }

 /*
@@ -1909,6 +1939,20 @@ xrep_inode_pptr(
 			sizeof(struct xfs_attr_sf_hdr), true);
 }

+/* Fix COW extent size hint problems. */
+STATIC void
+xrep_inode_cowextsize(
+	struct xfs_scrub	*sc)
+{
+	/* Fix misaligned CoW extent size hints on a directory. */
+	if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+	    (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    sc->ip->i_cowextsize % sc->mp->m_sb.sb_rextsize > 0) {
+		sc->ip->i_cowextsize = 0;
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+	}
+}
+
 /* Fix any irregularities in an inode that the verifiers don't catch. 
*/ STATIC int xrep_inode_problems( @@ -1932,6 +1976,7 @@ xrep_inode_problems( if (S_ISDIR(VFS_I(sc->ip)->i_mode)) xrep_inode_dir_size(sc); xrep_inode_extsize(sc); + xrep_inode_cowextsize(sc); trace_xrep_inode_fixed(sc); xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 74d71373e7ed..e21c16fbd15d 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -22,6 +22,7 @@ #include "xfs_attr.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -249,6 +250,8 @@ xchk_setup_metapath( return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ); case XFS_SCRUB_METAPATH_RTRMAPBT: return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP); + case XFS_SCRUB_METAPATH_RTREFCOUNTBT: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT); default: return -ENOENT; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 183d531875ea..58d6d4ed2853 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -212,12 +212,18 @@ xchk_quota_item( if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, + offset); } else { if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + offset); } - if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) + if (dq->q_ino.count > fs_icount) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index cd51f10f2920..8f4c8d41f308 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -233,7 +233,7 @@ xrep_quota_item( rqi->need_quotacheck = true; dirty = true; } - if (dq->q_rtb.count 
> mp->m_sb.sb_rblocks) { + if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) { dq->q_rtb.reserved -= dq->q_rtb.count; dq->q_rtb.reserved += mp->m_sb.sb_rblocks; dq->q_rtb.count = mp->m_sb.sb_rblocks; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 4d7f1b82dc55..b32fb233cf84 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -34,6 +34,8 @@ #include "xfs_attr_remote.h" #include "xfs_defer.h" #include "xfs_metafile.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -41,6 +43,7 @@ #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -311,7 +314,7 @@ xreap_agextent_binval( } out: - trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); + trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp); } /* @@ -370,7 +373,8 @@ xreap_agextent_select( out_found: *aglenp = len; - trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); + trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); out_cur: xfs_btree_del_cursor(cur, error); return error; @@ -409,7 +413,8 @@ xreap_agextent_iter( * to run xfs_repair. */ if (crosslinked) { - trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, + *aglenp); rs->force_roll = true; @@ -419,7 +424,8 @@ xreap_agextent_iter( * records from the refcountbt, which will remove the * rmap record as well. */ - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, + *aglenp); return 0; } @@ -427,7 +433,7 @@ xreap_agextent_iter( *aglenp, rs->oinfo); } - trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); /* * Invalidate as many buffers as we can, starting at agbno. 
If this @@ -451,7 +457,7 @@ xreap_agextent_iter( if (rs->oinfo == &XFS_RMAP_OINFO_COW) { ASSERT(rs->resv == XFS_AG_RESV_NONE); - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) @@ -678,6 +684,225 @@ xrep_reap_fsblocks( return 0; } +#ifdef CONFIG_XFS_RT +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. Units are rt blocks, not rt extents. + */ +STATIC int +xreap_rgextent_select( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_rgblock_t rgbno_next, + bool *crosslinked, + xfs_extlen_t *rglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_rgblock_t bno = rgbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); + error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. + */ + while (bno < rgbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + + *rglenp = len; + trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len, + *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this rtgroup extent as possible. + * The number of blocks disposed of will be returned in @rglenp. 
+ */ +STATIC int +xreap_rgextent_iter( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_extlen_t *rglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_rtblock_t rtbno; + int error; + + /* + * The only caller so far is CoW fork repair, so we only know how to + * unlink or free CoW staging extents. Here we don't have to worry + * about invalidating buffers! + */ + if (rs->oinfo != &XFS_RMAP_OINFO_COW) { + ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW); + return -EFSCORRUPTED; + } + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the forward and reverse mapping and move on. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno, + *rglenp); + + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + rs->deferred++; + return 0; + } + + trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); + + /* + * The CoW staging extent is not crosslinked. Use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL, + rs->resv, + XFS_FREE_EXTENT_REALTIME | + XFS_FREE_EXTENT_SKIP_DISCARD); + if (error) + return error; + + rs->deferred++; + return 0; +} + +#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +/* + * Break a rt file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * be aligned to a realtime extent. 
+ */ +STATIC int +xreap_rtmeta_extent( + uint64_t rtbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno); + xfs_rgblock_t rgbno_next = rgbno + len; + int error = 0; + + ASSERT(sc->ip != NULL); + ASSERT(!sc->sr.rtg); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to set up the rtgroup state in sc->sr ourselves. + */ + sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno)); + if (!sc->sr.rtg) + return -EFSCORRUPTED; + + xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + + while (rgbno < rgbno_next) { + xfs_extlen_t rglen; + bool crosslinked; + + error = xreap_rgextent_select(rs, rgbno, rgbno_next, + &crosslinked, &rglen); + if (error) + goto out_unlock; + + error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked); + if (error) + goto out_unlock; + + if (xreap_want_defer_finish(rs)) { + error = xfs_defer_finish(&sc->tp); + if (error) + goto out_unlock; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + if (error) + goto out_unlock; + xreap_reset(rs); + } + + rgbno += rglen; + } + +out_unlock: + xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + xfs_rtgroup_put(sc->sr.rtg); + sc->sr.rtg = NULL; + return error; +} + +/* + * Dispose of every block of every rt metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. 
+ */ +int +xrep_reap_rtblocks( + struct xfs_scrub *sc, + struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* * Dispose of every block of an old metadata btree that used to be rooted in a * metadata directory file. @@ -770,7 +995,8 @@ xreap_bmapi_select( } imap->br_blockcount = len; - trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked); + trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); out_cur: xfs_btree_del_cursor(cur, error); return error; @@ -909,7 +1135,8 @@ xreap_bmapi_binval( } out: - trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount); + trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno, + imap->br_blockcount); return 0; } @@ -936,7 +1163,7 @@ xrep_reap_bmapi_iter( * anybody else who thinks they own the block, even though that * runs the risk of stale buffer warnings in the future. */ - trace_xreap_dispose_unmap_extent(sc->sa.pag, + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), imap->br_blockcount); @@ -959,7 +1186,7 @@ xrep_reap_bmapi_iter( * by a block starting before the first block of the extent but overlap * anyway. 
*/ - trace_xreap_dispose_free_extent(sc->sa.pag, + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), imap->br_blockcount); diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index 70e5e6bbb8d3..4c8f62701fb3 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -17,6 +17,13 @@ int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap); +#ifdef CONFIG_XFS_RT +int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); +#else +# define xrep_reap_rtblocks(...) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + /* Buffer cache scan context. */ struct xrep_bufscan { /* Disk address for the buffers we want to scan. */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 1c5e45cc6419..d46528023015 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -421,7 +421,7 @@ xchk_refcount_mergeable( if (r1->rc_refcount != r2->rc_refcount) return false; if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > - MAXREFCEXTLEN) + XFS_REFC_LEN_MAX) return false; return true; diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index 4e572b81c986..9c8cb5332da0 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -183,13 +183,13 @@ xrep_refc_stash( if (xchk_should_terminate(sc, &error)) return error; - irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); error = xrep_refc_check_ext(rr->sc, &irec); if (error) return error; - trace_xrep_refc_found(sc->sa.pag, &irec); + trace_xrep_refc_found(pag_group(sc->sa.pag), &irec); return xfarray_append(rr->refcount_records, &irec); } @@ -422,7 +422,7 @@ xrep_refc_find_refcounts( /* * Set up a bag to store all the rmap records that we're tracking to * generate a reference 
count record. If the size of the bag exceeds - * MAXREFCOUNT, we clamp rc_refcount. + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. */ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); if (error) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 61e414c81253..3b5288d3ef4e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -42,6 +42,7 @@ #include "xfs_rtgroup.h" #include "xfs_rtalloc.h" #include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -1009,6 +1010,11 @@ xrep_rtgroup_btcur_init( (sr->rtlock_flags & XFS_RTGLOCK_RMAP) && xfs_has_rtrmapbt(mp)) sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT && + (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) && + xfs_has_rtreflink(mp)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index ac5962732d26..823c00d1a502 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -50,7 +50,9 @@ xrep_trans_commit( struct xbitmap; struct xagb_bitmap; +struct xrgb_bitmap; struct xfsb_bitmap; +struct xrtb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); @@ -98,6 +100,7 @@ int xrep_setup_nlinks(struct xfs_scrub *sc); int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); int xrep_setup_dirtree(struct xfs_scrub *sc); int xrep_setup_rtrmapbt(struct xfs_scrub *sc); +int xrep_setup_rtrefcountbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -157,11 +160,13 @@ int xrep_rtbitmap(struct xfs_scrub *sc); int xrep_rtsummary(struct xfs_scrub *sc); int xrep_rgsuperblock(struct xfs_scrub *sc); int xrep_rtrmapbt(struct xfs_scrub *sc); +int xrep_rtrefcountbt(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported # define xrep_rtsummary xrep_notsupported # define xrep_rgsuperblock xrep_notsupported 
# define xrep_rtrmapbt xrep_notsupported +# define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -235,6 +240,7 @@ xrep_setup_nothing( #define xrep_setup_dirtree xrep_setup_nothing #define xrep_setup_metapath xrep_setup_nothing #define xrep_setup_rtrmapbt xrep_setup_nothing +#define xrep_setup_rtrefcountbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) @@ -273,6 +279,7 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) #define xrep_metapath xrep_notsupported #define xrep_rgsuperblock xrep_notsupported #define xrep_rtrmapbt xrep_notsupported +#define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h new file mode 100644 index 000000000000..4c3126b66dcb --- /dev/null +++ b/fs/xfs/scrub/rgb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RGB_BITMAP_H__ +#define __XFS_SCRUB_RGB_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_rgblock_t (thin wrapper around xbitmap32) */ + +struct xrgb_bitmap { + struct xbitmap32 rgbitmap; +}; + +static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->rgbitmap); +} + +static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->rgbitmap); +} + +static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap, + xfs_rgblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->rgbitmap, start, len); +} + +static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->rgbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index c2c7b76cc25a..f5f73078ffe2 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -33,6 +33,7 @@ #include "xfs_ag.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtgroup.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -519,6 +520,9 @@ xrep_rmap_scan_meta_btree( case XFS_METAFILE_RTRMAP: type = XFS_RTGI_RMAP; break; + case XFS_METAFILE_RTREFCOUNT: + type = XFS_RTGI_REFCOUNT; + break; default: ASSERT(0); return -EFSCORRUPTED; @@ -545,6 +549,9 @@ found: case XFS_METAFILE_RTRMAP: cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); break; + case XFS_METAFILE_RTREFCOUNT: + cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg); + break; default: ASSERT(0); error = -EFSCORRUPTED; diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h new file mode 100644 index 000000000000..1313ef605511 --- /dev/null +++ b/fs/xfs/scrub/rtb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTB_BITMAP_H__ +#define __XFS_SCRUB_RTB_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_rtblock_t (thin wrapper around xbitmap64) */ + +struct xrtb_bitmap { + struct xbitmap64 rtbitmap; +}; + +static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->rtbitmap); +} + +static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->rtbitmap); +} + +static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap, + xfs_rtblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->rtbitmap, start, len); +} + +static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->rtbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RTB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 28c90a31f4c3..e8c776a34c1d 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -105,6 +105,8 @@ xchk_rtbitmap_xref( return; xchk_xref_has_no_rt_owner(sc, rgbno, blockcount); + xchk_xref_is_not_rt_shared(sc, rgbno, blockcount); + xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount); if (rtb->next_free_rgbno < rgbno) xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c index c6e33834c5ae..203a1a97c502 100644 --- a/fs/xfs/scrub/rtbitmap_repair.c +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -23,6 +23,7 @@ #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" #include "xfs_extent_busy.h" +#include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -183,7 +184,8 @@ xrep_rtbitmap_mark_free( xfs_rgblock_t rgbno) { struct xfs_mount *mp = rtb->sc->mp; - struct xfs_rtgroup *rtg = rtb->sc->sr.rtg; + struct xchk_rt *sr = &rtb->sc->sr; + struct xfs_rtgroup *rtg = sr->rtg; xfs_rtxnum_t startrtx; xfs_rtxnum_t nextrtx; xrep_wordoff_t wordoff, nextwordoff; @@ -191,6 +193,7 @@ 
xrep_rtbitmap_mark_free( unsigned int bufwsize; xfs_extlen_t mod; xfs_rtword_t mask; + enum xbtree_recpacking outcome; int error; if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno)) @@ -210,6 +213,25 @@ xrep_rtbitmap_mark_free( if (mod != mp->m_sb.sb_rextsize - 1) return -EFSCORRUPTED; + /* Must not be shared or CoW staging. */ + if (sr->refc_cur) { + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_COW, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1); /* Set bits as needed to round startrtx up to the nearest word. */ diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c new file mode 100644 index 000000000000..4c5dffc73641 --- /dev/null +++ b/fs/xfs/scrub/rtrefcount.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_rtalloc.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/repair.h" + +/* Set us up with the realtime refcount metadata locked. 
*/ +int +xchk_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtrefcountbt(sc); + if (error) + return error; + } + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + error = xchk_setup_rt(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg)); + if (error) + return error; + + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); +} + +/* Realtime Reference count btree scrubber. */ + +/* + * Confirming Reference Counts via Reverse Mappings + * + * We want to count the reverse mappings overlapping a refcount record + * (bno, len, refcount), allowing for the possibility that some of the + * overlap may come from smaller adjoining reverse mappings, while some + * comes from single extents which overlap the range entirely. The + * outer loop is as follows: + * + * 1. For all reverse mappings overlapping the refcount extent, + * a. If a given rmap completely overlaps, mark it as seen. + * b. Otherwise, record the fragment (in agbno order) for later + * processing. + * + * Once we've seen all the rmaps, we know that for all blocks in the + * refcount record we want to find $refcount owners and we've already + * visited $seen extents that overlap all the blocks. Therefore, we + * need to find ($refcount - $seen) owners for every block in the + * extent; call that quantity $target_nr. Proceed as follows: + * + * 2. Pull the first $target_nr fragments from the list; all of them + * should start at or before the start of the extent. + * Call this subset of fragments the working set. + * 3. Until there are no more unprocessed fragments, + * a. Find the shortest fragments in the set and remove them. + * b. Note the block number of the end of these fragments. + * c. Pull the same number of fragments from the list. 
All of these + * fragments should start at the block number recorded in the + * previous step. + * d. Put those fragments in the set. + * 4. Check that there are $target_nr fragments remaining in the list, + * and that they all end at or beyond the end of the refcount extent. + * + * If the refcount is correct, all the check conditions in the algorithm + * should always hold true. If not, the refcount is incorrect. + */ +struct xchk_rtrefcnt_frag { + struct list_head list; + struct xfs_rmap_irec rm; +}; + +struct xchk_rtrefcnt_check { + struct xfs_scrub *sc; + struct list_head fragments; + + /* refcount extent we're examining */ + xfs_rgblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + + /* number of owners seen */ + xfs_nlink_t seen; +}; + +/* + * Decide if the given rmap is large enough that we can redeem it + * towards refcount verification now, or if it's a fragment, in + * which case we'll hang onto it in the hopes that we'll later + * discover that we've collected exactly the correct number of + * fragments as the rtrefcountbt says we should have. + */ +STATIC int +xchk_rtrefcountbt_rmap_check( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_rtrefcnt_check *refchk = priv; + struct xchk_rtrefcnt_frag *frag; + xfs_rgblock_t rm_last; + xfs_rgblock_t rc_last; + int error = 0; + + if (xchk_should_terminate(refchk->sc, &error)) + return error; + + rm_last = rec->rm_startblock + rec->rm_blockcount - 1; + rc_last = refchk->bno + refchk->len - 1; + + /* Confirm that a single-owner refc extent is a CoW stage. */ + if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) { + xchk_btree_xref_set_corrupt(refchk->sc, cur, 0); + return 0; + } + + if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) { + /* + * The rmap overlaps the refcount record, so we can confirm + * one refcount owner seen. 
+ */ + refchk->seen++; + } else { + /* + * This rmap covers only part of the refcount record, so + * save the fragment for later processing. If the rmapbt + * is healthy each rmap_irec we see will be in agbno order + * so we don't need insertion sort here. + */ + frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag), + XCHK_GFP_FLAGS); + if (!frag) + return -ENOMEM; + memcpy(&frag->rm, rec, sizeof(frag->rm)); + list_add_tail(&frag->list, &refchk->fragments); + } + + return 0; +} + +/* + * Given a bunch of rmap fragments, iterate through them, keeping + * a running tally of the refcount. If this ever deviates from + * what we expect (which is the rtrefcountbt's refcount minus the + * number of extents that totally covered the rtrefcountbt extent), + * we have a rtrefcountbt error. + */ +STATIC void +xchk_rtrefcountbt_process_rmap_fragments( + struct xchk_rtrefcnt_check *refchk) +{ + struct list_head worklist; + struct xchk_rtrefcnt_frag *frag; + struct xchk_rtrefcnt_frag *n; + xfs_rgblock_t bno; + xfs_rgblock_t rbno; + xfs_rgblock_t next_rbno; + xfs_nlink_t nr; + xfs_nlink_t target_nr; + + target_nr = refchk->refcount - refchk->seen; + if (target_nr == 0) + return; + + /* + * There are (refchk->rc.rc_refcount - refchk->nr refcount) + * references we haven't found yet. Pull that many off the + * fragment list and figure out where the smallest rmap ends + * (and therefore the next rmap should start). All the rmaps + * we pull off should start at or before the beginning of the + * refcount record's range. + */ + INIT_LIST_HEAD(&worklist); + rbno = NULLRGBLOCK; + + /* Make sure the fragments actually /are/ in bno order. */ + bno = 0; + list_for_each_entry(frag, &refchk->fragments, list) { + if (frag->rm.rm_startblock < bno) + goto done; + bno = frag->rm.rm_startblock; + } + + /* + * Find all the rmaps that start at or before the refc extent, + * and put them on the worklist. 
+ */ + nr = 0; + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) + break; + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno < rbno) + rbno = bno; + list_move_tail(&frag->list, &worklist); + nr++; + } + + /* + * We should have found exactly $target_nr rmap fragments starting + * at or before the refcount extent. + */ + if (nr != target_nr) + goto done; + + while (!list_empty(&refchk->fragments)) { + /* Discard any fragments ending at rbno from the worklist. */ + nr = 0; + next_rbno = NULLRGBLOCK; + list_for_each_entry_safe(frag, n, &worklist, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno != rbno) { + if (bno < next_rbno) + next_rbno = bno; + continue; + } + list_del(&frag->list); + kfree(frag); + nr++; + } + + /* Try to add nr rmaps starting at rbno to the worklist. */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (frag->rm.rm_startblock != rbno) + goto done; + list_move_tail(&frag->list, &worklist); + if (next_rbno > bno) + next_rbno = bno; + nr--; + if (nr == 0) + break; + } + + /* + * If we get here and nr > 0, this means that we added fewer + * items to the worklist than we discarded because the fragment + * list ran out of items. Therefore, we cannot maintain the + * required refcount. Something is wrong, so we're done. + */ + if (nr) + goto done; + + rbno = next_rbno; + } + + /* + * Make sure the last extent we processed ends at or beyond + * the end of the refcount extent. + */ + if (rbno < refchk->bno + refchk->len) + goto done; + + /* Actually record us having seen the remaining refcount. */ + refchk->seen = refchk->refcount; +done: + /* Delete fragments and work list. 
*/ + list_for_each_entry_safe(frag, n, &worklist, list) { + list_del(&frag->list); + kfree(frag); + } + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Use the rmap entries covering this extent to verify the refcount. */ +STATIC void +xchk_rtrefcountbt_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + struct xchk_rtrefcnt_check refchk = { + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, + .seen = 0, + }; + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + struct xchk_rtrefcnt_frag *frag; + struct xchk_rtrefcnt_frag *n; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Cross-reference with the rmapbt to confirm the refcount. */ + memset(&low, 0, sizeof(low)); + low.rm_startblock = irec->rc_startblock; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; + + INIT_LIST_HEAD(&refchk.fragments); + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check, &refchk); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + goto out_free; + + xchk_rtrefcountbt_process_rmap_fragments(&refchk); + if (irec->rc_refcount != refchk.seen) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + +out_free: + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_rtrefcountbt_xref( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock), + irec->rc_blockcount); + xchk_rtrefcountbt_xref_rmap(sc, irec); +} + +struct xchk_rtrefcbt_records { + /* Previous refcount record. 
*/ + struct xfs_refcount_irec prev_rec; + + /* The next rtgroup block where we aren't expecting shared extents. */ + xfs_rgblock_t next_unshared_rgbno; + + /* Number of CoW blocks we expect. */ + xfs_extlen_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; +}; + +static inline bool +xchk_rtrefcount_mergeable( + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized (zero length = no previous record). */ + if (r1->rc_blockcount == 0) + return false; + + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + XFS_REFC_LEN_MAX) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rtrefcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rtrefcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + +STATIC int +xchk_rtrefcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_rgblock_t *next_bno = priv; + + if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings. 
+ */ +static inline void +xchk_rtrefcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_rtrefcbt_records *rrc, + xfs_rtblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_rgblock_t next_bno = NULLRGBLOCK; + int error; + + if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_rgbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur); +} + +/* Scrub a rtrefcountbt record. */ +STATIC int +xchk_rtrefcountbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xchk_rtrefcbt_records *rrc = bs->private; + struct xfs_refcount_irec irec; + u32 mod; + + xfs_refcount_btrec_to_irec(rec, &irec); + if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) != + NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* We can only share full rt extents. */ + mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + rrc->cow_blocks += irec.rc_blockcount; + + /* Shared records always come before CoW records. 
*/ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + + xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec); + xchk_rtrefcountbt_xref(bs->sc, &irec); + + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. + */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_rgbno = irec.rc_startblock + + irec.rc_blockcount; + } + + return 0; +} + +/* Make sure we have as many refc blocks as the rmap says. */ +STATIC void +xchk_refcount_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_owner_info *btree_oinfo, + xfs_extlen_t cow_blocks) +{ + xfs_filblks_t refcbt_blocks = 0; + xfs_filblks_t blocks; + int error; + + if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Check that we saw as many refcbt blocks as the rmap knows about. */ + error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks); + if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error)) + return; + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo, + &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != refcbt_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + + /* Check that we saw as many cow blocks as the rmap knows about. */ + error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur, + &XFS_RMAP_OINFO_COW, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (blocks != cow_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* Scrub the refcount btree for some AG. 
*/ +int +xchk_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xfs_owner_info btree_oinfo; + struct xchk_rtrefcbt_records rrc = { + .cow_blocks = 0, + .next_unshared_rgbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, + }; + int error; + + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino, + XFS_DATA_FORK); + error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec, + &btree_oinfo, &rrc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the rt volume have at most one reverse mapping. + */ + xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks); + + xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks); + + return 0; +} + +/* xref check that a cow staging extent is marked in the rtrefcountbt. */ +void +xchk_xref_is_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + struct xfs_refcount_irec rc; + int has_refcount; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + /* Find the CoW staging extent. */ + error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + /* CoW lookup returned a shared extent record? 
*/ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); + + /* Must be at least as long as what was passed in */ + if (rc.rc_blockcount < len) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* + * xref check that the extent is not shared. Only file data blocks + * can have multiple owners. + */ +void +xchk_xref_is_not_rt_shared( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, + XFS_REFC_DOMAIN_SHARED, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. */ +void +xchk_xref_is_not_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c new file mode 100644 index 000000000000..257cfb24beb4 --- /dev/null +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -0,0 +1,783 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_rtalloc.h" +#include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" +#include "scrub/rcbag.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). 
+ * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * rt refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the rt refcount btree root and + * insert all the records. 
+ */ + +struct xrep_rtrefc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xfsb_bitmap old_rtrefcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the rt refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_filblks_t btblocks; +}; + +/* Set us up to repair refcount btrees. */ +int +xrep_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_rtrefc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + xfs_rgblock_t last; + + if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL) + return -EFSCORRUPTED; + + if (xfs_rgbno_to_rtxoff(sc->mp, rec->rc_startblock) != 0) + return -EFSCORRUPTED; + + last = rec->rc_startblock + rec->rc_blockcount - 1; + if (xfs_rgbno_to_rtxoff(sc->mp, last) != sc->mp->m_sb.sb_rextsize - 1) + return -EFSCORRUPTED; + + /* Make sure this isn't free space or misaligned. */ + return xrep_require_rtext_inuse(sc, rec->rc_startblock, + rec->rc_blockcount); +} + +/* Record a reference count extent. 
*/ +STATIC int +xrep_rtrefc_stash( + struct xrep_rtrefc *rr, + enum xfs_refc_domain domain, + xfs_rgblock_t bno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = bno, + .rc_blockcount = len, + .rc_refcount = refcount, + .rc_domain = domain, + }; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); + + error = xrep_rtrefc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrefc_stash_cow( + struct xrep_rtrefc *rr, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_rtrefc_rmap_shareable( + const struct xfs_rmap_irec *rmap) +{ + /* rt metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Unwritten file blocks are not shareable. */ + if (rmap->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + + return true; +} + +/* Grab the next (abbreviated) rmap record from the rmapbt. */ +STATIC int +xrep_rtrefc_walk_rmaps( + struct xrep_rtrefc *rr, + struct xfs_rmap_irec *rmap, + bool *have_rec) +{ + struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. 
+ */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } else if (xfs_is_sb_inum(mp, rmap->rm_owner) || + (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + } while (!xrep_rtrefc_rmap_shareable(rmap)); + + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_rtrefc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* + * Compare two refcount records. We want to sort in order of increasing block + * number. + */ +static int +xrep_rtrefc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_rtrefc_encode_startblock(ap); + sb = xrep_rtrefc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. 
+ */ +STATIC int +xrep_rtrefc_sort_records( + struct xrep_rtrefc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_rgblock_t next_rgbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_rgbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_rgbno) + return -EFSCORRUPTED; + + next_rgbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +/* Record extents that belong to the realtime refcount inode. */ +STATIC int +xrep_rtrefc_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. */ + if (rec->rm_owner != rr->sc->ip->i_ino) + return 0; + + error = xrep_check_ino_btree_mapping(rr->sc, rec); + if (error) + return error; + + return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks, + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), + rec->rm_blockcount); +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. 
+ */ +static int +xrep_rtrefc_push_rmaps_at( + struct xrep_rtrefc *rr, + struct rcbag *rcstack, + xfs_rgblock_t bno, + struct xfs_rmap_irec *rmap, + bool *have) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); + if (error) + return error; + + error = xrep_rtrefc_walk_rmaps(rr, rmap, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sr.rmap_cur); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Scan one AG for reverse mappings for the realtime refcount btree. */ +STATIC int +xrep_rtrefc_scan_ag( + struct xrep_rtrefc *rr, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_rtrefc_find_refcounts( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct rcbag *rcstack; + struct xfs_perag *pag = NULL; + uint64_t old_stack_height; + xfs_rgblock_t sbno; + xfs_rgblock_t cbno; + xfs_rgblock_t nbno; + bool have; + int error; + + /* Scan for old rtrefc btree blocks. */ + while ((pag = xfs_perag_next(sc->mp, pag))) { + error = xrep_rtrefc_scan_ag(rr, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + xrep_rtgroup_btcur_init(sc, &sc->sr); + + /* + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If this exceeds + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. + */ + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); + if (error) + goto out_cur; + + /* Start the rtrmapbt cursor to the left of all records. 
*/ + error = xfs_btree_goto_left_edge(sc->sr.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sr.rmap_cur)) { + struct xfs_rmap_irec rmap; + + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rmap.rm_startblock; + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_height = rcbag_count(rcstack); + + /* While stack isn't empty... */ + while (rcbag_count(rcstack) > 0) { + /* Pop all rmaps that end at nbno */ + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { + error = xrep_rtrefc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_height); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (rcbag_count(rcstack) == 0) + break; + old_stack_height = rcbag_count(rcstack); + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(rcbag_count(rcstack) == 0); +out_bag: + rcbag_free(&rcstack); +out_cur: + xchk_rtgroup_btcur_free(&sc->sr); + return error; +} + +/* Retrieve refcountbt data for bulk load. 
*/ +STATIC int +xrep_rtrefc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + &cur->bc_rec.rc); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrefc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_rtrefc_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level, + nr_this_level); +} + +/* + * Use the collected refcount information to stage a new rt refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_rtrefc_build_new_tree( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_btree_cur *refc_cur; + int error; + + error = xrep_rtrefc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the realtime refcount inode. 
+ */ + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); + if (error) + return error; + + rr->new_btree.bload.get_records = xrep_rtrefc_get_records; + rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block; + rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size; + + refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg); + xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake); + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* + * Guess how many blocks we're going to need to rebuild an entire + * rtrefcountbt from the number of extents we found, and pump up our + * transaction to have sufficient block reservation. We're allowed + * to exceed quota to repair inconsistent metadata, though this is + * unlikely. + */ + error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg), + rr->new_btree.bload.nr_blocks, 0, true); + if (error) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* Add all observed refcount records. */ + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_cur; + + /* + * Install the new rtrefc btree in the inode. After this point the old + * btree is no longer accessible, the new tree is live, and we can + * delete the cursor. + */ + xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp); + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); + xfs_btree_del_cursor(refc_cur, 0); + + /* Dispose of any unused blocks and the accounting information. 
*/ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_trans(sc); +err_cur: + xfs_btree_del_cursor(refc_cur, error); + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_rtrefc_remove_old_tree( + struct xrep_rtrefc *rr) +{ + int error; + + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); + if (error) + return error; + + /* + * Ensure the proper reservation for the rtrefcount inode so that we + * don't fail to expand the btree. + */ + return xrep_reset_metafile_resv(rr->sc); +} + +/* Rebuild the rt refcount btree. */ +int +xrep_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrefc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rtrmapbt(mp)) + return -EOPNOTSUPP; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per rt extent. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_rextents, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks); + error = xrep_rtrefc_find_refcounts(rr); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the refcount information. */ + error = xrep_rtrefc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. 
*/ + error = xrep_rtrefc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c index 300a1e85b3d6..12989fe80e8b 100644 --- a/fs/xfs/scrub/rtrmap.c +++ b/fs/xfs/scrub/rtrmap.c @@ -22,6 +22,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtgroup.h" #include "xfs_metafile.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -77,6 +78,18 @@ struct xchk_rtrmap { struct xfs_rmap_irec prev_rec; }; +static inline bool +xchk_rtrmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_rtreflink(sc->mp)) + return false; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + return true; +} + /* Flag failures for records that overlap but cannot. */ STATIC void xchk_rtrmapbt_check_overlapping( @@ -98,7 +111,10 @@ xchk_rtrmapbt_check_overlapping( if (pnext <= irec->rm_startblock) goto set_prev; - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rtrmapbt_is_shareable(bs->sc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); /* Save whichever rmap record extends furthest. */ inext = irec->rm_startblock + irec->rm_blockcount; @@ -149,6 +165,37 @@ xchk_rtrmapbt_check_mergeable( memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); } +/* Cross-reference a rmap against the refcount btree. 
*/ +STATIC void +xchk_rtrmapbt_xref_rtrefc( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + xfs_rgblock_t fbno; + xfs_extlen_t flen; + bool is_inode; + bool is_bmbt; + bool is_attr; + bool is_unwritten; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + /* If this is shared, must be a data fork extent. */ + error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock, + irec->rm_blockcount, &fbno, &flen, false); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten)) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + /* Cross-reference with other metadata. */ STATIC void xchk_rtrmapbt_xref( @@ -161,6 +208,11 @@ xchk_rtrmapbt_xref( xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock), irec->rm_blockcount); + if (irec->rm_owner == XFS_RMAP_OWN_COW) + xchk_xref_is_cow_staging(sc, irec->rm_startblock, + irec->rm_blockcount); + else + xchk_rtrmapbt_xref_rtrefc(sc, irec); } /* Scrub a realtime rmapbt record. 
*/ diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index 49de8bc2dd17..f2fdd7a9fc24 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -30,6 +30,7 @@ #include "xfs_rtalloc.h" #include "xfs_ag.h" #include "xfs_rtgroup.h" +#include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -38,6 +39,7 @@ #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rgb_bitmap.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" @@ -423,6 +425,100 @@ xrep_rtrmap_scan_ag( return error; } +struct xrep_rtrmap_stash_run { + struct xrep_rtrmap *rr; + uint64_t owner; +}; + +static int +xrep_rtrmap_stash_run( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_rtrmap_stash_run *rsr = priv; + struct xrep_rtrmap *rr = rsr->rr; + xfs_rgblock_t rgbno = start; + + return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0); +} + +/* + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure + * that the ranges are in units of FS blocks. + */ +STATIC int +xrep_rtrmap_stash_bitmap( + struct xrep_rtrmap *rr, + struct xrgb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xrep_rtrmap_stash_run rsr = { + .rr = rr, + .owner = oinfo->oi_owner, + }; + + return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrmap_walk_cowblocks( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec, + void *priv) +{ + struct xrgb_bitmap *bitmap = priv; + + if (!xfs_refcount_check_domain(irec) || + irec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + return xrgb_bitmap_set(bitmap, irec->rc_startblock, + irec->rc_blockcount); +} + +/* + * Collect rmaps for the blocks containing the refcount btree, and all CoW + * staging extents. 
+ */ +STATIC int +xrep_rtrmap_find_refcount_rmaps( + struct xrep_rtrmap *rr) +{ + struct xrgb_bitmap cow_blocks; /* COWBIT */ + struct xfs_refcount_irec low = { + .rc_startblock = 0, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_refcount_irec high = { + .rc_startblock = -1U, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + if (!xfs_has_rtreflink(sc->mp)) + return 0; + + xrgb_bitmap_init(&cow_blocks); + + /* Collect rmaps for CoW staging extents. */ + error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high, + xrep_rtrmap_walk_cowblocks, &cow_blocks); + if (error) + goto out_bitmap; + + /* Generate rmaps for everything. */ + error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xrgb_bitmap_destroy(&cow_blocks); + return error; +} + /* Count and check all collected records. */ STATIC int xrep_rtrmap_check_record( @@ -460,6 +556,13 @@ xrep_rtrmap_find_rmaps( return error; } + /* Find CoW staging extents. */ + xrep_rtgroup_btcur_init(sc, &sc->sr); + error = xrep_rtrmap_find_refcount_rmaps(rr); + xchk_rtgroup_btcur_free(&sc->sr); + if (error) + return error; + /* * Set up for a potentially lengthy filesystem scan by reducing our * transaction resource usage for the duration. 
Specifically: diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 16da054b2eb0..7567dd5cad14 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -467,6 +467,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .has = xfs_has_rtrmapbt, .repair = xrep_rtrmapbt, }, + [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */ + .type = ST_RTGROUP, + .setup = xchk_setup_rtrefcountbt, + .scrub = xchk_rtrefcountbt, + .has = xfs_has_rtreflink, + .repair = xrep_rtrefcountbt, + }, }; static int diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index cba4e89a3a62..a1086f1f06d0 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -129,6 +129,7 @@ struct xchk_rt { /* rtgroup btrees */ struct xfs_btree_cur *rmap_cur; + struct xfs_btree_cur *refc_cur; }; struct xfs_scrub { @@ -284,11 +285,13 @@ int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); int xchk_rgsuperblock(struct xfs_scrub *sc); int xchk_rtrmapbt(struct xfs_scrub *sc); +int xchk_rtrefcountbt(struct xfs_scrub *sc); #else # define xchk_rtbitmap xchk_nothing # define xchk_rtsummary xchk_nothing # define xchk_rgsuperblock xchk_nothing # define xchk_rtrmapbt xchk_nothing +# define xchk_rtrefcountbt xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); @@ -328,11 +331,20 @@ void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, xfs_extlen_t len); void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); +void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); #else # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) # define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while 
(0) # define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0) # define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0) +# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0) #endif #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index eb6bb170c902..f8a37ea97791 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -83,6 +83,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_METAPATH] = "metapath", [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper", [XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt", + [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index fb86b746bc17..d7c4ced47c15 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -77,6 +77,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -111,7 +112,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \ { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \ { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \ - { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" } + { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \ + { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -1962,32 +1964,36 @@ DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup); #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t 
len), - TP_ARGS(pag, agbno, len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = pag_mount(pag)->m_super->s_dev; - __entry->agno = pag_agno(pag); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len) ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ xfs_extlen_t len), \ - TP_ARGS(pag, agbno, len)) + TP_ARGS(xg, agbno, len)) DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); @@ -1995,35 +2001,39 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), - TP_ARGS(pag, agbno, len, crosslinked), + TP_ARGS(xg, agbno, len, crosslinked), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) __field(bool, crosslinked) ), TP_fast_assign( - __entry->dev = pag_mount(pag)->m_super->s_dev; - __entry->agno = pag_agno(pag); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = 
xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->crosslinked = crosslinked; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x crosslinked %d", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->crosslinked ? 1 : 0) ); #define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ DEFINE_EVENT(xrep_reap_find_class, name, \ - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ xfs_extlen_t len, bool crosslinked), \ - TP_ARGS(pag, agbno, len, crosslinked)) + TP_ARGS(xg, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); @@ -2114,29 +2124,33 @@ TRACE_EVENT(xrep_ibt_found, ) TRACE_EVENT(xrep_refc_found, - TP_PROTO(const struct xfs_perag *pag, + TP_PROTO(const struct xfs_group *xg, const struct xfs_refcount_irec *rec), - TP_ARGS(pag, rec), + TP_ARGS(xg, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) + __field(enum xfs_group_type, type) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = pag_mount(pag)->m_super->s_dev; - __entry->agno = pag_agno(pag); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->agno = xg->xg_gno; + __entry->type = xg->xg_type; __entry->domain = rec->rc_domain; __entry->startblock = rec->rc_startblock; __entry->blockcount = rec->rc_blockcount; __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + 
__print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 4f2e4ea29e1f..b05d5b81f642 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -271,6 +271,9 @@ xlog_recover_validate_buf_type( case XFS_REFC_CRC_MAGIC: bp->b_ops = &xfs_refcountbt_buf_ops; break; + case XFS_RTREFC_CRC_MAGIC: + bp->b_ops = &xfs_rtrefcountbt_buf_ops; + break; default: warnmsg = "Bad btree block magic!"; break; @@ -859,6 +862,7 @@ xlog_recover_get_buf_lsn( break; } case XFS_RTRMAP_CRC_MAGIC: + case XFS_RTREFC_CRC_MAGIC: case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: { struct xfs_btree_block *btb = blk; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3e3ef16f65a3..1dbd2d75f7ae 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -27,6 +27,7 @@ #include "xfs_ag.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* Convert an xfs_fsmap to an fsmap. */ static void @@ -212,21 +213,20 @@ xfs_getfsmap_is_shared( struct xfs_mount *mp = tp->t_mountp; struct xfs_btree_cur *cur; xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_extlen_t flen = 0; int error; *stat = false; - if (!xfs_has_reflink(mp)) - return 0; - /* rt files will have no perag structure */ - if (!info->group) + if (!xfs_has_reflink(mp) || !info->group) return 0; - /* Are there any shared blocks here? */ - flen = 0; - cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, - to_perag(info->group)); + if (info->group->xg_type == XG_TYPE_RTG) + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(info->group)); + else + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, + to_perag(info->group)); + /* Are there any shared blocks here? 
*/ error = xfs_refcount_find_shared(cur, frec->rec_key, XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen, false); @@ -863,7 +863,7 @@ xfs_getfsmap_rtdev_rmapbt_query( struct xfs_rtgroup *rtg = to_rtg(info->group); /* Query the rtrmapbt */ - xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); *curpp = xfs_rtrmapbt_init_cursor(tp, rtg); return xfs_rmap_query_range(*curpp, &info->low, &info->high, xfs_getfsmap_rtdev_rmapbt_helper, info); @@ -950,7 +950,8 @@ xfs_getfsmap_rtdev_rmapbt( if (bt_cur) { xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), - XFS_RTGLOCK_RMAP); + XFS_RTGLOCK_RMAP | + XFS_RTGLOCK_REFCOUNT); xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); bt_cur = NULL; } @@ -988,7 +989,7 @@ xfs_getfsmap_rtdev_rmapbt( if (bt_cur) { xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), - XFS_RTGLOCK_RMAP); + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9df5a09c0acd..455298503d01 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -23,6 +23,7 @@ #include "xfs_trace.h" #include "xfs_rtalloc.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -231,6 +232,7 @@ xfs_growfs_data_private( /* Compute new maxlevels for rt btrees. */ xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); } return error; diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index d438c3c001c8..7c541fb373d5 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -448,6 +448,7 @@ static const struct ioctl_sick_map rtgroup_map[] = { { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY }, { XFS_SICK_RG_RMAPBT, XFS_RTGROUP_GEOM_SICK_RMAPBT }, + { XFS_SICK_RG_REFCNTBT, XFS_RTGROUP_GEOM_SICK_REFCNTBT }, }; /* Fill out rtgroup geometry health info. 
*/ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a174f64b8bb2..70283c6419fd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -157,6 +157,20 @@ xfs_inode_item_precommit( if (flags & XFS_ILOG_IVERSION) flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + /* + * Inode verifiers do not check that the CoW extent size hint is an + * integer multiple of the rt extent size on a directory with both + * rtinherit and cowextsize flags set. If we're logging a directory + * that is misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } + if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 5de1d3563b76..f3bfb814378c 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -23,6 +23,7 @@ #include "xfs_icache.h" #include "xfs_bmap_btree.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" STATIC void xlog_recover_inode_ra_pass2( @@ -286,6 +287,9 @@ xlog_recover_inode_dbroot( case XFS_METAFILE_RTRMAP: xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize); return 0; + case XFS_METAFILE_RTREFCOUNT: + xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize); + return 0; default: ASSERT(0); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0789c18aaa18..726282e74d54 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -469,8 +469,21 @@ xfs_fill_fsxattr( } } - if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) - fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { + /* + * Don't let a misaligned CoW extent size hint on a directory + * escape to userspace if it won't pass the setattr 
checks + * later. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) { + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + fa->fsx_cowextsize = 0; + } else { + fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); + } + } + fa->fsx_projid = ip->i_projid; if (ifp && !xfs_need_iread_extents(ifp)) fa->fsx_nextents = xfs_iext_count(ifp); @@ -541,10 +554,6 @@ xfs_ioctl_setattr_xflags( if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; - - /* Clear reflink if we are actually able to set the rt flag. */ - if (xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; } /* diflags2 only valid for v3 inodes. */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5c95c97519c7..b3c27dbccce8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1822,6 +1822,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_rtefd_item_ops, &xlog_rtrui_item_ops, &xlog_rtrud_item_ops, + &xlog_rtcui_item_ops, + &xlog_rtcud_item_ops, }; static const struct xlog_recover_item_ops * diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7b7d21b50d54..477c5262cf91 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -38,6 +38,7 @@ #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -656,7 +657,8 @@ static inline void xfs_rtbtree_compute_maxlevels( struct xfs_mount *mp) { - mp->m_rtbtree_maxlevels = mp->m_rtrmap_maxlevels; + mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels, + mp->m_rtrefc_maxlevels); } /* @@ -729,6 +731,7 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_rtrmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); xfs_rtbtree_compute_maxlevels(mp); diff --git 
a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1bc95fb170db..9a1516080e63 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -162,11 +162,14 @@ typedef struct xfs_mount { uint m_rtrmap_mnr[2]; /* min rtrmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ + uint m_rtrefc_mxr[2]; /* max rtrefc btree records */ + uint m_rtrefc_mnr[2]; /* min rtrefc btree records */ uint m_alloc_maxlevels; /* max alloc btree levels */ uint m_bm_maxlevels[2]; /* max bmap btree levels */ uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_rtrmap_maxlevels; /* max rtrmap btree level */ uint m_refc_maxlevels; /* max refcount btree level */ + uint m_rtrefc_maxlevels; /* max rtrefc btree level */ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ @@ -408,6 +411,12 @@ static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp) xfs_has_rmapbt(mp); } +static inline bool xfs_has_rtreflink(struct xfs_mount *mp) +{ + return xfs_has_metadir(mp) && xfs_has_realtime(mp) && + xfs_has_reflink(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. 
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index bede1c96c330..fe2d7aab8554 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -23,6 +23,7 @@ #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -94,8 +95,9 @@ xfs_cui_item_format( ASSERT(atomic_read(&cuip->cui_next_extent) == cuip->cui_format.cui_nextents); + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); - cuip->cui_format.cui_type = XFS_LI_CUI; + cuip->cui_format.cui_type = lip->li_type; cuip->cui_format.cui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, @@ -138,12 +140,14 @@ xfs_cui_item_release( STATIC struct xfs_cui_log_item * xfs_cui_init( struct xfs_mount *mp, + unsigned short item_type, uint nextents) - { struct xfs_cui_log_item *cuip; ASSERT(nextents > 0); + ASSERT(item_type == XFS_LI_CUI || item_type == XFS_LI_CUI_RT); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) cuip = kzalloc(xfs_cui_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); @@ -151,7 +155,7 @@ xfs_cui_init( cuip = kmem_cache_zalloc(xfs_cui_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + xfs_log_item_init(mp, &cuip->cui_item, item_type, &xfs_cui_item_ops); cuip->cui_format.cui_nextents = nextents; cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; atomic_set(&cuip->cui_next_extent, 0); @@ -190,7 +194,9 @@ xfs_cud_item_format( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); struct xfs_log_iovec *vecp = NULL; - cudp->cud_format.cud_type = XFS_LI_CUD; + ASSERT(lip->li_type == XFS_LI_CUD || lip->li_type == XFS_LI_CUD_RT); + + cudp->cud_format.cud_type = lip->li_type; cudp->cud_format.cud_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, @@ -234,6 +240,14 @@ static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e) return 
list_entry(e, struct xfs_refcount_intent, ri_list); } +static inline bool +xfs_cui_item_isrt(const struct xfs_log_item *lip) +{ + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); + + return lip->li_type == XFS_LI_CUI_RT; +} + /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -282,18 +296,20 @@ xfs_refcount_update_log_item( } static struct xfs_log_item * -xfs_refcount_update_create_intent( +__xfs_refcount_update_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, - bool sort) + bool sort, + unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); + struct xfs_cui_log_item *cuip; struct xfs_refcount_intent *ri; ASSERT(count > 0); + cuip = xfs_cui_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -301,6 +317,23 @@ xfs_refcount_update_create_intent( return &cuip->cui_item; } +static struct xfs_log_item * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_refcount_update_create_intent(tp, items, count, sort, + XFS_LI_CUI); +} + +static inline unsigned short +xfs_cud_type_from_cui(const struct xfs_cui_log_item *cuip) +{ + return xfs_cui_item_isrt(&cuip->cui_item) ? XFS_LI_CUD_RT : XFS_LI_CUD; +} + /* Get an CUD so we can process all the deferred refcount updates. 
*/ static struct xfs_log_item * xfs_refcount_update_create_done( @@ -312,8 +345,8 @@ xfs_refcount_update_create_done( struct xfs_cud_log_item *cudp; cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, + xfs_cud_type_from_cui(cuip), &xfs_cud_item_ops); cudp->cud_cuip = cuip; cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; @@ -328,10 +361,20 @@ xfs_refcount_defer_add( { struct xfs_mount *mp = tp->t_mountp; - trace_xfs_refcount_defer(mp, ri); + /* + * Deferred refcount updates for the realtime and data sections must + * use separate transactions to finish deferred work because updates to + * realtime metadata files can lock AGFs to allocate btree blocks and + * we don't want that mixing with the AGF locks taken to finish data + * section updates. + */ + ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, + ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); - ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG); - xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); + trace_xfs_refcount_defer(mp, ri); + xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? + &xfs_rtrefcount_update_defer_type : + &xfs_refcount_update_defer_type); } /* Cancel a deferred refcount update. 
*/ @@ -381,7 +424,7 @@ xfs_refcount_finish_one_cleanup( return; agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); - if (error) + if (error && agbp) xfs_trans_brelse(tp, agbp); } @@ -397,6 +440,7 @@ xfs_refcount_update_abort_intent( static inline bool xfs_cui_validate_phys( struct xfs_mount *mp, + bool isrt, struct xfs_phys_extent *pmap) { if (!xfs_has_reflink(mp)) @@ -415,6 +459,9 @@ xfs_cui_validate_phys( return false; } + if (isrt) + return xfs_verify_rtbext(mp, pmap->pe_startblock, pmap->pe_len); + return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } @@ -422,6 +469,7 @@ static inline void xfs_cui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + bool isrt, struct xfs_phys_extent *pmap) { struct xfs_refcount_intent *ri; @@ -432,7 +480,8 @@ xfs_cui_recover_work( ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock, - XG_TYPE_AG); + isrt ? XG_TYPE_RTG : XG_TYPE_AG); + ri->ri_realtime = isrt; xfs_defer_add_item(dfp, &ri->ri_list); } @@ -451,6 +500,7 @@ xfs_refcount_recover_work( struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_trans *tp; struct xfs_mount *mp = lip->li_log->l_mp; + bool isrt = xfs_cui_item_isrt(lip); int i; int error = 0; @@ -460,7 +510,7 @@ xfs_refcount_recover_work( * just toss the CUI. 
*/ for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - if (!xfs_cui_validate_phys(mp, + if (!xfs_cui_validate_phys(mp, isrt, &cuip->cui_format.cui_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, @@ -468,7 +518,8 @@ xfs_refcount_recover_work( return -EFSCORRUPTED; } - xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); + xfs_cui_recover_work(mp, dfp, isrt, + &cuip->cui_format.cui_extents[i]); } /* @@ -515,10 +566,13 @@ xfs_refcount_relog_intent( struct xfs_phys_extent *pmap; unsigned int count; + ASSERT(intent->li_type == XFS_LI_CUI || + intent->li_type == XFS_LI_CUI_RT); + count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - cuip = xfs_cui_init(tp->t_mountp, count); + cuip = xfs_cui_init(tp->t_mountp, intent->li_type, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); @@ -538,6 +592,71 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .relog_intent = xfs_refcount_relog_intent, }; +#ifdef CONFIG_XFS_RT +static struct xfs_log_item * +xfs_rtrefcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_refcount_update_create_intent(tp, items, count, sort, + XFS_LI_CUI_RT); +} + +/* Process a deferred realtime refcount update. */ +STATIC int +xfs_rtrefcount_update_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_refcount_intent *ri = ci_entry(item); + int error; + + error = xfs_rtrefcount_finish_one(tp, ri, state); + + /* Did we run out of reservation? Requeue what we didn't finish. 
*/ + if (!error && ri->ri_blockcount > 0) { + ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || + ri->ri_type == XFS_REFCOUNT_DECREASE); + return -EAGAIN; + } + + xfs_refcount_update_cancel_item(item); + return error; +} + +/* Clean up after calling xfs_rtrefcount_finish_one. */ +STATIC void +xfs_rtrefcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + if (rcur) + xfs_btree_del_cursor(rcur, error); +} + +const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { + .name = "rtrefcount", + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rtrefcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_rtrefcount_update_finish_item, + .finish_cleanup = xfs_rtrefcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; +#else +const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { + .name = "rtrefcount", +}; +#endif /* CONFIG_XFS_RT */ + STATIC bool xfs_cui_item_match( struct xfs_log_item *lip, @@ -603,7 +722,7 @@ xlog_recover_cui_commit_pass2( return -EFSCORRUPTED; } - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); @@ -617,6 +736,61 @@ const struct xlog_recover_item_ops xlog_cui_item_ops = { .commit_pass2 = xlog_recover_cui_commit_pass2, }; +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtcui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + size_t len; + + cui_formatp = item->ri_buf[0].i_addr; + + if 
(item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + &xfs_rtrefcount_update_defer_type); + return 0; +} +#else +STATIC int +xlog_recover_rtcui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; +} +#endif + +const struct xlog_recover_item_ops xlog_rtcui_item_ops = { + .item_type = XFS_LI_CUI_RT, + .commit_pass2 = xlog_recover_rtcui_commit_pass2, +}; + /* * This routine is called when an CUD format structure is found in a committed * transaction in the log. 
Its purpose is to cancel the corresponding CUI if it @@ -648,3 +822,33 @@ const struct xlog_recover_item_ops xlog_cud_item_ops = { .item_type = XFS_LI_CUD, .commit_pass2 = xlog_recover_cud_commit_pass2, }; + +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtcud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_format *cud_formatp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_CUI_RT, + cud_formatp->cud_cui_id); + return 0; +} +#else +# define xlog_recover_rtcud_commit_pass2 xlog_recover_rtcui_commit_pass2 +#endif + +const struct xlog_recover_item_ops xlog_rtcud_item_ops = { + .item_type = XFS_LI_CUD_RT, + .commit_pass2 = xlog_recover_rtcud_commit_pass2, +}; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index b11769c009ef..59f7fc16eb80 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -30,6 +30,10 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_health.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" /* * Copy on Write of Shared Blocks @@ -120,38 +124,93 @@ */ /* - * Given an AG extent, find the lowest-numbered run of shared blocks - * within that range and return the range in fbno/flen. If - * find_end_of_shared is true, return the longest contiguous extent of - * shared blocks. If there are no shared extents, fbno and flen will - * be set to NULLAGBLOCK and 0, respectively. + * Given a file mapping for the data device, find the lowest-numbered run of + * shared blocks within that mapping and return it in shared_offset/shared_len. + * The offset is relative to the start of irec. 
+ * + * If find_end_of_shared is true, return the longest contiguous extent of shared + * blocks. If there are no shared extents, shared_offset and shared_len will be + * set to 0; */ static int xfs_reflink_find_shared( - struct xfs_perag *pag, + struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agblock_t agbno, - xfs_extlen_t aglen, - xfs_agblock_t *fbno, - xfs_extlen_t *flen, + const struct xfs_bmbt_irec *irec, + xfs_extlen_t *shared_offset, + xfs_extlen_t *shared_len, bool find_end_of_shared) { struct xfs_buf *agbp; + struct xfs_perag *pag; struct xfs_btree_cur *cur; int error; + xfs_agblock_t orig_bno, found_bno; + + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) - return error; + goto out; - cur = xfs_refcountbt_init_cursor(pag_mount(pag), tp, agbp, pag); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, + &found_bno, shared_len, find_end_of_shared); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(tp, agbp); - error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, - find_end_of_shared); + if (!error && *shared_len) + *shared_offset = found_bno - orig_bno; +out: + xfs_perag_put(pag); + return error; +} +/* + * Given a file mapping for the rt device, find the lowest-numbered run of + * shared blocks within that mapping and return it in shared_offset/shared_len. + * The offset is relative to the start of irec. + * + * If find_end_of_shared is true, return the longest contiguous extent of shared + * blocks. 
If there are no shared extents, shared_offset and shared_len will be + * set to 0; + */ +static int +xfs_reflink_find_rtshared( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_bmbt_irec *irec, + xfs_extlen_t *shared_offset, + xfs_extlen_t *shared_len, + bool find_end_of_shared) +{ + struct xfs_rtgroup *rtg; + struct xfs_btree_cur *cur; + xfs_rgblock_t orig_bno; + xfs_agblock_t found_bno; + int error; + + BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK); + + /* + * Note: this uses the not quite correct xfs_agblock_t type because + * xfs_refcount_find_shared is shared between the RT and data device + * refcount code. + */ + orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock); + rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, irec->br_startblock)); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, rtg); + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, + &found_bno, shared_len, find_end_of_shared); xfs_btree_del_cursor(cur, error); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_put(rtg); - xfs_trans_brelse(tp, agbp); + if (!error && *shared_len) + *shared_offset = found_bno - orig_bno; return error; } @@ -172,11 +231,7 @@ xfs_reflink_trim_around_shared( bool *shared) { struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_extlen_t shared_offset, shared_len; int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ @@ -187,41 +242,37 @@ xfs_reflink_trim_around_shared( trace_xfs_reflink_trim_around_shared(ip, irec); - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); - agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); - aglen = irec->br_blockcount; - - error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, - true); - xfs_perag_put(pag); + if (XFS_IS_REALTIME_INODE(ip)) + error = xfs_reflink_find_rtshared(mp, NULL, irec, + 
&shared_offset, &shared_len, true); + else + error = xfs_reflink_find_shared(mp, NULL, irec, + &shared_offset, &shared_len, true); if (error) return error; - *shared = false; - if (fbno == NULLAGBLOCK) { + if (!shared_len) { /* No shared blocks at all. */ - return 0; - } - - if (fbno == agbno) { + *shared = false; + } else if (!shared_offset) { /* - * The start of this extent is shared. Truncate the - * mapping at the end of the shared region so that a - * subsequent iteration starts at the start of the - * unshared region. + * The start of this mapping points to shared space. Truncate + * the mapping at the end of the shared region so that a + * subsequent iteration starts at the start of the unshared + * region. */ - irec->br_blockcount = flen; + irec->br_blockcount = shared_len; *shared = true; - return 0; + } else { + /* + * There's a shared region that doesn't start at the beginning + * of the mapping. Truncate the mapping at the start of the + * shared extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = shared_offset; + *shared = false; } - - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. 
- */ - irec->br_blockcount = fbno - agbno; return 0; } @@ -389,20 +440,26 @@ xfs_reflink_fill_cow_hole( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_filblks_t resaligned; - xfs_extlen_t resblks; + unsigned int dblocks = 0, rblocks = 0; int nimaps; int error; bool found; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } xfs_iunlock(ip, *lockmode); *lockmode = 0; - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, - false, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); if (error) return error; @@ -571,6 +628,7 @@ xfs_reflink_cancel_cow_blocks( struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; + bool isrt = XFS_IS_REALTIME_INODE(ip); int error = 0; if (!xfs_inode_has_cow_data(ip)) @@ -598,12 +656,13 @@ xfs_reflink_cancel_cow_blocks( ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); /* Free the CoW orphan record. */ - xfs_refcount_free_cow_extent(*tpp, del.br_startblock, - del.br_blockcount); + xfs_refcount_free_cow_extent(*tpp, isrt, + del.br_startblock, del.br_blockcount); error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE, 0); + XFS_AG_RESV_NONE, + isrt ? XFS_FREE_EXTENT_REALTIME : 0); if (error) break; @@ -687,6 +746,35 @@ out: return error; } +#ifdef CONFIG_XFS_QUOTA +/* + * Update quota accounting for a remapping operation. When we're remapping + * something from the CoW fork to the data fork, we must update the quota + * accounting for delayed allocations. For remapping from the data fork to the + * data fork, use regular block accounting. 
+ */ +static inline void +xfs_reflink_update_quota( + struct xfs_trans *tp, + struct xfs_inode *ip, + bool is_cow, + int64_t blocks) +{ + unsigned int qflag; + + if (XFS_IS_REALTIME_INODE(ip)) { + qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT; + } else { + qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT; + } + xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks); +} +#else +# define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0) +#endif + /* * Remap part of the CoW fork into the data fork. * @@ -710,6 +798,7 @@ xfs_reflink_end_cow_extent( struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); unsigned int resblks; int nmaps; + bool isrt = XFS_IS_REALTIME_INODE(ip); int error; resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); @@ -779,9 +868,8 @@ xfs_reflink_end_cow_extent( * or not), unmap the extent and drop its refcount. */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); - xfs_refcount_decrease_extent(tp, &data); - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, - -data.br_blockcount); + xfs_refcount_decrease_extent(tp, isrt, &data); + xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount); } else if (data.br_startblock == DELAYSTARTBLOCK) { int done; @@ -799,14 +887,14 @@ xfs_reflink_end_cow_extent( } /* Free the CoW orphan record. */ - xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); + xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock, + del.br_blockcount); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); /* Charge this new data fork mapping to the on-disk quota. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, - (long)del.br_blockcount); + xfs_reflink_update_quota(tp, ip, true, del.br_blockcount); /* Remove the mapping from the CoW fork. 
*/ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); @@ -895,20 +983,29 @@ xfs_reflink_recover_cow( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; int error = 0; if (!xfs_has_reflink(mp)) return 0; while ((pag = xfs_perag_next(mp, pag))) { - error = xfs_refcount_recover_cow_leftovers(mp, pag); + error = xfs_refcount_recover_cow_leftovers(pag_group(pag)); if (error) { xfs_perag_rele(pag); - break; + return error; } } - return error; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_refcount_recover_cow_leftovers(rtg_group(rtg)); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + return 0; } /* @@ -1100,14 +1197,28 @@ out_error: static int xfs_reflink_ag_has_free_space( struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_inode *ip, + xfs_fsblock_t fsb) { struct xfs_perag *pag; + xfs_agnumber_t agno; int error = 0; if (!xfs_has_rmapbt(mp)) return 0; + if (XFS_IS_REALTIME_INODE(ip)) { + struct xfs_rtgroup *rtg; + xfs_rgnumber_t rgno; + + rgno = xfs_rtb_to_rgno(mp, fsb); + rtg = xfs_rtgroup_get(mp, rgno); + if (xfs_metafile_resv_critical(rtg_rmap(rtg))) + error = -ENOSPC; + xfs_rtgroup_put(rtg); + return error; + } + agno = XFS_FSB_TO_AGNO(mp, fsb); pag = xfs_perag_get(mp, agno); if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) @@ -1131,10 +1242,11 @@ xfs_reflink_remap_extent( struct xfs_trans *tp; xfs_off_t newlen; int64_t qdelta = 0; - unsigned int resblks; + unsigned int dblocks, rblocks, resblks; bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); + bool isrt = XFS_IS_REALTIME_INODE(ip); int iext_delta = 0; int nimaps; int error; @@ -1161,8 +1273,15 @@ xfs_reflink_remap_extent( * we're remapping. 
*/ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = resblks; + rblocks = dmap->br_blockcount; + } else { + dblocks = resblks + dmap->br_blockcount; + rblocks = 0; + } error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, - resblks + dmap->br_blockcount, 0, false, &tp); + dblocks, rblocks, false, &tp); if (error == -EDQUOT || error == -ENOSPC) { quota_reserved = false; error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, @@ -1213,8 +1332,8 @@ xfs_reflink_remap_extent( /* No reflinking if the AG of the dest mapping is low on space. */ if (dmap_written) { - error = xfs_reflink_ag_has_free_space(mp, - XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); + error = xfs_reflink_ag_has_free_space(mp, ip, + dmap->br_startblock); if (error) goto out_cancel; } @@ -1242,8 +1361,15 @@ xfs_reflink_remap_extent( * done. */ if (!quota_reserved && !smap_real && dmap_written) { - error = xfs_trans_reserve_quota_nblks(tp, ip, - dmap->br_blockcount, 0, false); + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = 0; + rblocks = dmap->br_blockcount; + } else { + dblocks = dmap->br_blockcount; + rblocks = 0; + } + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + false); if (error) goto out_cancel; } @@ -1264,7 +1390,7 @@ xfs_reflink_remap_extent( * or not), unmap the extent and drop its refcount. */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap); - xfs_refcount_decrease_extent(tp, &smap); + xfs_refcount_decrease_extent(tp, isrt, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { int done; @@ -1287,12 +1413,12 @@ xfs_reflink_remap_extent( * its refcount and map it into the file. 
*/ if (dmap_written) { - xfs_refcount_increase_extent(tp, dmap); + xfs_refcount_increase_extent(tp, isrt, dmap); xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap); qdelta += dmap->br_blockcount; } - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); + xfs_reflink_update_quota(tp, ip, false, qdelta); /* Update dest isize if needed. */ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); @@ -1466,8 +1592,8 @@ xfs_reflink_remap_prep( /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - /* Don't reflink realtime inodes */ - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + /* Can't reflink between data and rt volumes */ + if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest)) goto out_unlock; /* Don't share DAX file data with non-DAX file. */ @@ -1547,27 +1673,23 @@ xfs_reflink_inode_has_shared_extents( *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { - struct xfs_perag *pag; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; + xfs_extlen_t shared_offset, shared_len; if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) goto next; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); - agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); - aglen = got.br_blockcount; - error = xfs_reflink_find_shared(pag, tp, agbno, aglen, - &rbno, &rlen, false); - xfs_perag_put(pag); + if (XFS_IS_REALTIME_INODE(ip)) + error = xfs_reflink_find_rtshared(mp, tp, &got, + &shared_offset, &shared_len, false); + else + error = xfs_reflink_find_shared(mp, tp, &got, + &shared_offset, &shared_len, false); if (error) return error; /* Is there still a shared block here? */ - if (rbno != NULLAGBLOCK) { + if (shared_len) { *has_shared = true; return 0; } @@ -1700,3 +1822,28 @@ out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } + +/* + * Can we use reflink with this realtime extent size? 
Note that we don't check + * for rblocks > 0 here because this can be called as part of attaching a new + * rt section. + */ +bool +xfs_reflink_supports_rextsize( + struct xfs_mount *mp, + unsigned int rextsize) +{ + /* reflink on the realtime device requires rtgroups */ + if (!xfs_has_rtgroups(mp)) + return false; + + /* + * Reflink doesn't support rt extent size larger than a single fsblock + * because we would have to perform CoW-around for unaligned write + * requests to guarantee that we always remap entire rt extents. + */ + if (rextsize != 1) + return false; + + return true; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 4a58e4533671..cc4e92278279 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -25,7 +25,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) return true; } -extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, +int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); @@ -62,4 +62,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); +bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); + #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a69967f9d88e..d8e6d073d64d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -31,6 +31,8 @@ #include "xfs_rtgroup.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_reflink.h" /* * Return whether there are any free extents in the size range given @@ -593,7 +595,7 @@ xfs_rtalloc_sumlevel( * specified. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. 
*/ -STATIC int +static int xfs_rtallocate_extent_size( struct xfs_rtalloc_args *args, xfs_rtxlen_t minlen, /* minimum length to allocate */ @@ -994,6 +996,7 @@ xfs_growfs_rt_bmblock( */ mp->m_features |= XFS_FEAT_REALTIME; xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); kfree(nmp); return 0; @@ -1177,6 +1180,7 @@ xfs_growfs_check_rtgeom( nmp->m_sb.sb_dblocks = dblocks; xfs_rtrmapbt_compute_maxlevels(nmp); + xfs_rtrefcountbt_compute_maxlevels(nmp); xfs_trans_resv_calc(nmp, M_RES(nmp)); /* @@ -1289,8 +1293,10 @@ xfs_growfs_rt( goto out_unlock; if (xfs_has_quota(mp)) goto out_unlock; - } - if (xfs_has_reflink(mp)) + if (xfs_has_reflink(mp)) + goto out_unlock; + } else if (xfs_has_reflink(mp) && + !xfs_reflink_supports_rextsize(mp, in->extsize)) goto out_unlock; error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); @@ -1547,6 +1553,11 @@ xfs_rt_resv_init( err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); if (err2 && !error) error = err2; + + ask = xfs_rtrefcountbt_calc_reserves(mp); + err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); + if (err2 && !error) + error = err2; } return error; @@ -1950,7 +1961,7 @@ out_unlock: goto out_release; } -static int +int xfs_rtallocate_rtgs( struct xfs_trans *tp, xfs_fsblock_t bno_hint, @@ -2015,7 +2026,10 @@ xfs_rtallocate_align( if (*noalign) { align = mp->m_sb.sb_rextsize; } else { - align = xfs_get_extsz_hint(ap->ip); + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else + align = xfs_get_extsz_hint(ap->ip); if (!align) align = 1; if (align == mp->m_sb.sb_rextsize) diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 9044f7226ab6..0d95b29092c9 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -77,4 +77,9 @@ xfs_growfs_check_rtgeom(const struct xfs_mount *mp, } #endif /* CONFIG_XFS_RT */ +int xfs_rtallocate_rtgs(struct xfs_trans *tp, xfs_fsblock_t bno_hint, + xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, xfs_rtxlen_t prod, + bool wasdel, 
bool initial_user_data, xfs_rtblock_t *bno, + xfs_extlen_t *blen); + #endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index b7f2988bc03b..35c7fb3ba324 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -54,7 +54,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, { "rcbagbt", xfsstats_offset(xs_rtrmap_2) }, { "rtrmapbt", xfsstats_offset(xs_rtrmap_mem_2)}, - { "rtrmapbt_mem", xfsstats_offset(xs_qm_dqreclaims)}, + { "rtrmapbt_mem", xfsstats_offset(xs_rtrefcbt_2) }, + { "rtrefcntbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 9c47de5dff2d..15ba1abcf253 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -129,6 +129,7 @@ struct __xfsstats { uint32_t xs_rcbag_2[__XBTS_MAX]; uint32_t xs_rtrmap_2[__XBTS_MAX]; uint32_t xs_rtrmap_mem_2[__XBTS_MAX]; + uint32_t xs_rtrefcbt_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ecd5a9f444d8..7c3f996cd39e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1754,9 +1754,11 @@ xfs_fs_fill_super( xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); if (xfs_has_reflink(mp)) { - if (mp->m_sb.sb_rblocks) { + if (xfs_has_realtime(mp) && + !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { xfs_alert(mp, - "reflink not compatible with realtime device!"); + "reflink not compatible with realtime extent size %u!", + mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 84cdc145e2d9..4fe689410eb6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3305,56 +3305,62 @@ TRACE_EVENT(xfs_ag_resv_init_error, /* refcount tracepoint classes */ 
DECLARE_EVENT_CLASS(xfs_refcount_class, - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_extlen_t len), - TP_ARGS(cur, agbno, len), + TP_ARGS(cur, gbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; - __entry->agbno = agbno; + __entry->gbno = gbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_class, name, \ - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \ + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, \ xfs_extlen_t len), \ - TP_ARGS(cur, agbno, len)) + TP_ARGS(cur, gbno, len)) TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); TRACE_EVENT(xfs_refcount_lookup, - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, xfs_lookup_t dir), - TP_ARGS(cur, agbno, dir), + TP_ARGS(cur, gbno, dir), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_lookup_t, dir) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; - __entry->agbno = agbno; + __entry->gbno = gbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)", + 
TP_printk("dev %d:%d %sno 0x%x gbno 0x%x cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), __entry->dir) ) @@ -3365,6 +3371,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_ARGS(cur, irec), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) @@ -3373,14 +3380,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, @@ -3396,49 +3405,53 @@ DEFINE_EVENT(xfs_refcount_extent_class, name, \ /* single-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, - xfs_agblock_t agbno), - TP_ARGS(cur, irec, agbno), + xfs_agblock_t gbno), + TP_ARGS(cur, irec, gbno), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = 
cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; - __entry->agbno = agbno; + __entry->gbno = gbno; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount, - __entry->agbno) + __entry->gbno) ) #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \ - xfs_agblock_t agbno), \ - TP_ARGS(cur, irec, agbno)) + xfs_agblock_t gbno), \ + TP_ARGS(cur, irec, gbno)) /* double-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, - struct xfs_refcount_irec *i2), + struct xfs_refcount_irec *i2), TP_ARGS(cur, i1, i2), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3451,6 +3464,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; @@ -3461,9 +3475,10 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x 
fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3484,10 +3499,11 @@ DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ /* double-rcext and an agbno tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), - TP_ARGS(cur, i1, i2, agbno), + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), + TP_ARGS(cur, i1, i2, gbno), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3497,10 +3513,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; @@ -3510,11 +3527,12 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; - __entry->agbno = agbno; + __entry->gbno = gbno; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", MAJOR(__entry->dev), 
MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3524,14 +3542,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, - __entry->agbno) + __entry->gbno) ) #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \ - TP_ARGS(cur, i1, i2, agbno)) + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), \ + TP_ARGS(cur, i1, i2, gbno)) /* triple-rcext tracepoint class */ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, @@ -3540,6 +3558,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_ARGS(cur, i1, i2, i3), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) @@ -3556,6 +3575,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->type = cur->bc_group->xg_type; __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; @@ -3570,10 +3590,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " - "dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, 
XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, @@ -3641,23 +3662,27 @@ DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, TP_ARGS(mp, refc), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(int, op) - __field(xfs_agblock_t, agbno) + __field(xfs_agblock_t, gbno) __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock); + __entry->type = refc->ri_group->xg_type; + __entry->agno = refc->ri_group->xg_gno; __entry->op = refc->ri_type; - __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock); + __entry->gbno = xfs_fsb_to_gbno(mp, refc->ri_startblock, + refc->ri_group->xg_type); __entry->len = refc->ri_blockcount; ), - TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, + __entry->gbno, __entry->len) ); #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ |