diff options
Diffstat (limited to 'fs')
204 files changed, 2898 insertions, 1858 deletions
| diff --git a/fs/Kconfig b/fs/Kconfig index c718b2e2de0e..5b4847bd2fbb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -368,6 +368,7 @@ config GRACE_PERIOD  config LOCKD  	tristate  	depends on FILE_LOCKING +	select CRC32  	select GRACE_PERIOD  config LOCKD_V4 diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 691e0ae607a1..8c6130789fde 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -348,9 +348,9 @@ static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)  	}  	if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { -		rcu_read_lock(); +		down_read(&net->cells_lock);  		ret = afs_dynroot_readdir_cells(net, ctx); -		rcu_read_unlock(); +		up_read(&net->cells_lock);  	}  	return ret;  } diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index bf1c94e51dd0..07709b0d7688 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -4,7 +4,7 @@ config BCACHEFS_FS  	depends on BLOCK  	select EXPORTFS  	select CLOSURES -	select LIBCRC32C +	select CRC32  	select CRC64  	select FS_POSIX_ACL  	select LZ4_COMPRESS @@ -15,10 +15,9 @@ config BCACHEFS_FS  	select ZLIB_INFLATE  	select ZSTD_COMPRESS  	select ZSTD_DECOMPRESS -	select CRYPTO  	select CRYPTO_LIB_SHA256 -	select CRYPTO_CHACHA20 -	select CRYPTO_POLY1305 +	select CRYPTO_LIB_CHACHA +	select CRYPTO_LIB_POLY1305  	select KEYS  	select RAID6_PQ  	select XOR_BLOCKS diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7c930ef77380..7ec022e9361a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1422,8 +1422,30 @@ alloc_done:  	wp->sectors_free = UINT_MAX; -	open_bucket_for_each(c, &wp->ptrs, ob, i) +	open_bucket_for_each(c, &wp->ptrs, ob, i) { +		/* +		 * Ensure proper write alignment - either due to misaligned +		 * bucket sizes (from buggy bcachefs-tools), or writes that mix +		 * logical/physical alignment: +		 */ +		struct bch_dev *ca = ob_dev(c, ob); +		u64 offset = bucket_to_sector(ca, ob->bucket) + +			
ca->mi.bucket_size - +			ob->sectors_free; +		unsigned align = round_up(offset, block_sectors(c)) - offset; + +		ob->sectors_free = max_t(int, 0, ob->sectors_free - align); +  		wp->sectors_free = min(wp->sectors_free, ob->sectors_free); +	} + +	wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + +	/* Did alignment use up space in an open_bucket? */ +	if (unlikely(!wp->sectors_free)) { +		bch2_alloc_sectors_done(c, wp); +		goto retry; +	}  	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 69ec6a012898..4c1e33cf57c0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -110,7 +110,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ  	unsigned i;  	open_bucket_for_each(c, &wp->ptrs, ob, i) -		ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); +		ob_push(c, ob->sectors_free < block_sectors(c) +			? &ptrs +			: &keep, ob);  	wp->ptrs = keep;  	mutex_unlock(&wp->lock); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ff26bb515150..5f195d2280a4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -192,7 +192,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,  static int backpointer_target_not_found(struct btree_trans *trans,  				  struct bkey_s_c_backpointer bp,  				  struct bkey_s_c target_k, -				  struct bkey_buf *last_flushed) +				  struct bkey_buf *last_flushed, +				  bool commit)  {  	struct bch_fs *c = trans->c;  	struct printbuf buf = PRINTBUF; @@ -228,18 +229,77 @@ static int backpointer_target_not_found(struct btree_trans *trans,  		}  	if (fsck_err(trans, backpointer_to_missing_ptr, -		     "%s", buf.buf)) +		     "%s", buf.buf)) {  		ret = bch2_backpointer_del(trans, bp.k->p); +		if (ret || !commit) +			goto out; + +		/* +		 * Normally, on transaction commit from inside a transaction, +		 * we'll return 
-BCH_ERR_transaction_restart_nested, since a +		 * transaction commit invalidates pointers given out by peek(). +		 * +		 * However, since we're updating a write buffer btree, if we +		 * return a transaction restart and loop we won't see that the +		 * backpointer has been deleted without an additional write +		 * buffer flush - and those are expensive. +		 * +		 * So we're relying on the caller immediately advancing to the +		 * next backpointer and starting a new transaction immediately +		 * after backpointer_get_key() returns NULL: +		 */ +		ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +	} +out:  fsck_err:  	printbuf_exit(&buf);  	return ret;  } -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, -					 struct bkey_s_c_backpointer bp, -					 struct btree_iter *iter, -					 unsigned iter_flags, -					 struct bkey_buf *last_flushed) +static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, +						 struct bkey_s_c_backpointer bp, +						 struct btree_iter *iter, +						 struct bkey_buf *last_flushed, +						 bool commit) +{ +	struct bch_fs *c = trans->c; + +	BUG_ON(!bp.v->level); + +	bch2_trans_node_iter_init(trans, iter, +				  bp.v->btree_id, +				  bp.v->pos, +				  0, +				  bp.v->level - 1, +				  0); +	struct btree *b = bch2_btree_iter_peek_node(trans, iter); +	if (IS_ERR_OR_NULL(b)) +		goto err; + +	BUG_ON(b->c.level != bp.v->level - 1); + +	if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, +			      bkey_i_to_s_c(&b->key), bp)) +		return b; + +	if (btree_node_will_make_reachable(b)) { +		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); +	} else { +		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), +						       last_flushed, commit); +		b = ret ? 
ERR_PTR(ret) : NULL; +	} +err: +	bch2_trans_iter_exit(trans, iter); +	return b; +} + +static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, +						  struct bkey_s_c_backpointer bp, +						  struct btree_iter *iter, +						  unsigned iter_flags, +						  struct bkey_buf *last_flushed, +						  bool commit)  {  	struct bch_fs *c = trans->c; @@ -277,10 +337,10 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,  	bch2_trans_iter_exit(trans, iter);  	if (!bp.v->level) { -		int ret = backpointer_target_not_found(trans, bp, k, last_flushed); +		int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);  		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;  	} else { -		struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); +		struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);  		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))  			return bkey_s_c_null;  		if (IS_ERR_OR_NULL(b)) @@ -295,35 +355,16 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,  					struct btree_iter *iter,  					struct bkey_buf *last_flushed)  { -	struct bch_fs *c = trans->c; - -	BUG_ON(!bp.v->level); - -	bch2_trans_node_iter_init(trans, iter, -				  bp.v->btree_id, -				  bp.v->pos, -				  0, -				  bp.v->level - 1, -				  0); -	struct btree *b = bch2_btree_iter_peek_node(trans, iter); -	if (IS_ERR_OR_NULL(b)) -		goto err; - -	BUG_ON(b->c.level != bp.v->level - 1); - -	if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, -			      bkey_i_to_s_c(&b->key), bp)) -		return b; +	return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); +} -	if (btree_node_will_make_reachable(b)) { -		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); -	} else { -		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); -		b = ret ? 
ERR_PTR(ret) : NULL; -	} -err: -	bch2_trans_iter_exit(trans, iter); -	return b; +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, +					 struct bkey_s_c_backpointer bp, +					 struct btree_iter *iter, +					 unsigned iter_flags, +					 struct bkey_buf *last_flushed) +{ +	return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);  }  static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, @@ -521,7 +562,7 @@ check_existing_bp:  	struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);  	struct bkey_s_c other_extent = -		bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); +		__bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);  	ret = bkey_err(other_extent);  	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)  		ret = 0; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5d9f208a1bb7..75f7408da173 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -788,6 +788,8 @@ struct bch_fs {  		unsigned long	errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];  		u64		btrees_lost_data;  	}			sb; +	DARRAY(enum bcachefs_metadata_version) +				incompat_versions_requested;  #ifdef CONFIG_UNICODE  	struct unicode_map	*cf_encoding; @@ -981,8 +983,8 @@ struct bch_fs {  	mempool_t		compress_workspace[BCH_COMPRESSION_OPT_NR];  	size_t			zstd_workspace_size; -	struct crypto_sync_skcipher *chacha20; -	struct crypto_shash	*poly1305; +	struct bch_key		chacha20_key; +	bool			chacha20_key_set;  	atomic64_t		key_version; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a3db328dee31..d6e4a496f02b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k)  #define __BKEY_PADDED(key, pad)					\  	struct bkey_i key; __u64 key ## _pad[pad] +enum bch_bkey_type_flags { +	BKEY_TYPE_strict_btree_checks	= 
BIT(0), +}; +  /*   * - DELETED keys are used internally to mark keys that should be ignored but   *   override keys in composition order.  Their version number is ignored. @@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k)   *   * - WHITEOUT: for hash table btrees   */ -#define BCH_BKEY_TYPES()				\ -	x(deleted,		0)			\ -	x(whiteout,		1)			\ -	x(error,		2)			\ -	x(cookie,		3)			\ -	x(hash_whiteout,	4)			\ -	x(btree_ptr,		5)			\ -	x(extent,		6)			\ -	x(reservation,		7)			\ -	x(inode,		8)			\ -	x(inode_generation,	9)			\ -	x(dirent,		10)			\ -	x(xattr,		11)			\ -	x(alloc,		12)			\ -	x(quota,		13)			\ -	x(stripe,		14)			\ -	x(reflink_p,		15)			\ -	x(reflink_v,		16)			\ -	x(inline_data,		17)			\ -	x(btree_ptr_v2,		18)			\ -	x(indirect_inline_data,	19)			\ -	x(alloc_v2,		20)			\ -	x(subvolume,		21)			\ -	x(snapshot,		22)			\ -	x(inode_v2,		23)			\ -	x(alloc_v3,		24)			\ -	x(set,			25)			\ -	x(lru,			26)			\ -	x(alloc_v4,		27)			\ -	x(backpointer,		28)			\ -	x(inode_v3,		29)			\ -	x(bucket_gens,		30)			\ -	x(snapshot_tree,	31)			\ -	x(logged_op_truncate,	32)			\ -	x(logged_op_finsert,	33)			\ -	x(accounting,		34)			\ -	x(inode_alloc_cursor,	35) +#define BCH_BKEY_TYPES()						\ +	x(deleted,		0,	0)				\ +	x(whiteout,		1,	0)				\ +	x(error,		2,	0)				\ +	x(cookie,		3,	0)				\ +	x(hash_whiteout,	4,	BKEY_TYPE_strict_btree_checks)	\ +	x(btree_ptr,		5,	BKEY_TYPE_strict_btree_checks)	\ +	x(extent,		6,	BKEY_TYPE_strict_btree_checks)	\ +	x(reservation,		7,	BKEY_TYPE_strict_btree_checks)	\ +	x(inode,		8,	BKEY_TYPE_strict_btree_checks)	\ +	x(inode_generation,	9,	BKEY_TYPE_strict_btree_checks)	\ +	x(dirent,		10,	BKEY_TYPE_strict_btree_checks)	\ +	x(xattr,		11,	BKEY_TYPE_strict_btree_checks)	\ +	x(alloc,		12,	BKEY_TYPE_strict_btree_checks)	\ +	x(quota,		13,	BKEY_TYPE_strict_btree_checks)	\ +	x(stripe,		14,	BKEY_TYPE_strict_btree_checks)	\ +	x(reflink_p,		15,	BKEY_TYPE_strict_btree_checks)	\ +	x(reflink_v,		16,	BKEY_TYPE_strict_btree_checks)	\ +	x(inline_data,		17,	
BKEY_TYPE_strict_btree_checks)	\ +	x(btree_ptr_v2,		18,	BKEY_TYPE_strict_btree_checks)	\ +	x(indirect_inline_data,	19,	BKEY_TYPE_strict_btree_checks)	\ +	x(alloc_v2,		20,	BKEY_TYPE_strict_btree_checks)	\ +	x(subvolume,		21,	BKEY_TYPE_strict_btree_checks)	\ +	x(snapshot,		22,	BKEY_TYPE_strict_btree_checks)	\ +	x(inode_v2,		23,	BKEY_TYPE_strict_btree_checks)	\ +	x(alloc_v3,		24,	BKEY_TYPE_strict_btree_checks)	\ +	x(set,			25,	0)				\ +	x(lru,			26,	BKEY_TYPE_strict_btree_checks)	\ +	x(alloc_v4,		27,	BKEY_TYPE_strict_btree_checks)	\ +	x(backpointer,		28,	BKEY_TYPE_strict_btree_checks)	\ +	x(inode_v3,		29,	BKEY_TYPE_strict_btree_checks)	\ +	x(bucket_gens,		30,	BKEY_TYPE_strict_btree_checks)	\ +	x(snapshot_tree,	31,	BKEY_TYPE_strict_btree_checks)	\ +	x(logged_op_truncate,	32,	BKEY_TYPE_strict_btree_checks)	\ +	x(logged_op_finsert,	33,	BKEY_TYPE_strict_btree_checks)	\ +	x(accounting,		34,	BKEY_TYPE_strict_btree_checks)	\ +	x(inode_alloc_cursor,	35,	BKEY_TYPE_strict_btree_checks)  enum bch_bkey_type { -#define x(name, nr) KEY_TYPE_##name	= nr, +#define x(name, nr, ...) KEY_TYPE_##name	= nr,  	BCH_BKEY_TYPES()  #undef x  	KEY_TYPE_MAX, @@ -863,6 +867,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,  LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,	struct bch_sb, flags[6],  0,  4);  LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);  LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR,	struct bch_sb, flags[6], 14, 20); +LE64_BITMASK(BCH_SB_CASEFOLD,		struct bch_sb, flags[6], 22, 23);  static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)  { diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 15c93576b5c2..00d05ccfaf73 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -21,7 +21,7 @@  #include "xattr.h"  const char * const bch2_bkey_types[] = { -#define x(name, nr) #name, +#define x(name, nr, ...) 
#name,  	BCH_BKEY_TYPES()  #undef x  	NULL @@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_  })  const struct bkey_ops bch2_bkey_ops[] = { -#define x(name, nr) [KEY_TYPE_##name]	= bch2_bkey_ops_##name, +#define x(name, nr, ...) [KEY_TYPE_##name]	= bch2_bkey_ops_##name,  	BCH_BKEY_TYPES()  #undef x  }; @@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = {  #undef x  }; +static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { +#define x(name, nr, flags)	[KEY_TYPE_##name] = flags, +	BCH_BKEY_TYPES() +#undef x +}; +  const char *bch2_btree_node_type_str(enum btree_node_type type)  {  	return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); @@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,  	if (type >= BKEY_TYPE_NR)  		return 0; -	bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && -			 (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && +	enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX +		? 
bch2_bkey_type_flags[k.k->type] +		: 0; + +	bool strict_key_type_allowed = +		(from.flags & BCH_VALIDATE_commit) || +		type == BKEY_TYPE_btree || +		(from.btree < BTREE_ID_NR && +		 (bkey_flags & BKEY_TYPE_strict_btree_checks)); + +	bkey_fsck_err_on(strict_key_type_allowed && +			 k.k->type < KEY_TYPE_MAX &&  			 !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),  			 c, bkey_invalid_type_for_btree,  			 "invalid key type for btree %s (%s)", diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9b80201c7982..899891295797 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -852,7 +852,6 @@ out:  	b->sib_u64s[1]		= 0;  	b->whiteout_u64s	= 0;  	bch2_btree_keys_init(b); -	set_btree_node_accessed(b);  	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],  			       start_time); @@ -1286,6 +1285,10 @@ lock_node:  			six_unlock_read(&b->c.lock);  			goto retry;  		} + +		/* avoid atomic set bit if it's not needed: */ +		if (!btree_node_accessed(b)) +			set_btree_node_accessed(b);  	}  	/* XXX: waiting on IO with btree locks held: */ @@ -1301,10 +1304,6 @@ lock_node:  		prefetch(p + L1_CACHE_BYTES * 2);  	} -	/* avoid atomic set bit if it's not needed: */ -	if (!btree_node_accessed(b)) -		set_btree_node_accessed(b); -  	if (unlikely(btree_node_read_error(b))) {  		six_unlock_read(&b->c.lock);  		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 7b98ba2dec64..37b69d89341f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -47,6 +47,27 @@  #define DROP_PREV_NODE		11  #define DID_FILL_FROM_SCAN	12 +/* + * Returns true if it's a btree we can easily reconstruct, or otherwise won't + * cause data loss if it's missing: + */ +static bool btree_id_important(enum btree_id btree) +{ +	if (btree_id_is_alloc(btree)) +		return false; + +	switch (btree) { +	case BTREE_ID_quotas: +	case BTREE_ID_snapshot_trees: +	case BTREE_ID_logged_ops: +	case 
BTREE_ID_rebalance_work: +	case BTREE_ID_subvolume_children: +		return false; +	default: +		return true; +	} +} +  static const char * const bch2_gc_phase_strs[] = {  #define x(n)	#n,  	GC_PHASES() @@ -534,8 +555,10 @@ reconstruct_root:  			r->error = 0;  			if (!bch2_btree_has_scanned_nodes(c, i)) { -				mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, -						 "no nodes found for btree %s, continue?", buf.buf); +				__fsck_err(trans, +					   FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), +					   btree_root_unreadable_and_scan_found_nothing, +					   "no nodes found for btree %s, continue?", buf.buf);  				bch2_btree_root_alloc_fake_trans(trans, i, 0);  			} else {  				bch2_btree_root_alloc_fake_trans(trans, i, 1); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5fd4a58d2ad2..60782f3e5aec 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -41,6 +41,7 @@ void bch2_btree_node_io_unlock(struct btree *b)  	clear_btree_node_write_in_flight_inner(b);  	clear_btree_node_write_in_flight(b); +	smp_mb__after_atomic();  	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);  } @@ -1400,6 +1401,7 @@ start:  	printbuf_exit(&buf);  	clear_btree_node_read_in_flight(b); +	smp_mb__after_atomic();  	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);  } @@ -1595,6 +1597,7 @@ fsck_err:  	printbuf_exit(&buf);  	clear_btree_node_read_in_flight(b); +	smp_mb__after_atomic();  	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);  } @@ -1721,6 +1724,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,  		set_btree_node_read_error(b);  		bch2_btree_lost_data(c, b->c.btree_id);  		clear_btree_node_read_in_flight(b); +		smp_mb__after_atomic();  		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);  		printbuf_exit(&buf);  		return; @@ -2061,8 +2065,10 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start  	if (new & (1U << BTREE_NODE_write_in_flight))  		
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); -	else +	else { +		smp_mb__after_atomic();  		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +	}  }  static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) @@ -2175,6 +2181,7 @@ static void btree_node_write_endio(struct bio *bio)  	}  	clear_btree_node_write_in_flight_inner(b); +	smp_mb__after_atomic();  	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);  	INIT_WORK(&wb->work, btree_node_write_work);  	queue_work(c->btree_io_complete_wq, &wb->work); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e34e9598ef25..a873ec1baf58 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_  		return NULL;  	} +	/* +	 * We don't correctly handle nodes with extra intent locks here: +	 * downgrade so we don't violate locking invariants +	 */ +	bch2_btree_path_downgrade(trans, path); +  	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {  		__bch2_btree_path_unlock(trans, path);  		path->l[path->level].b		= ERR_PTR(-BCH_ERR_no_btree_node_relock); @@ -2577,7 +2583,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct  					      struct bpos end)  {  	if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && -	   !bkey_eq(iter->pos, POS_MAX)) { +	   !bkey_eq(iter->pos, POS_MAX) && +	   !((iter->flags & BTREE_ITER_is_extents) && +	     iter->pos.offset == U64_MAX)) { +  		/*  		 * bkey_start_pos(), for extents, is not monotonically  		 * increasing until after filtering for snapshots: @@ -2602,7 +2611,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct  	bch2_trans_verify_not_unlocked_or_in_restart(trans);  	bch2_btree_iter_verify_entry_exit(iter); -	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); +	EBUG_ON((iter->flags 
& BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);  	int ret = trans_maybe_inject_restart(trans, _RET_IP_);  	if (unlikely(ret)) { @@ -2740,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre  	ret = trans_maybe_inject_restart(trans, _RET_IP_);  	if (unlikely(ret)) {  		k = bkey_s_c_err(ret); -		goto out_no_locked; +		goto out;  	}  	/* extents can't span inode numbers: */ @@ -2760,13 +2769,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre  	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);  	if (unlikely(ret)) {  		k = bkey_s_c_err(ret); -		goto out_no_locked; +		goto out;  	}  	struct btree_path *path = btree_iter_path(trans, iter);  	if (unlikely(!btree_path_node(path, path->level)))  		return bkey_s_c_null; +	btree_path_set_should_be_locked(trans, path); +  	if ((iter->flags & BTREE_ITER_cached) ||  	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {  		k = bkey_s_c_null; @@ -2787,12 +2798,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre  			if (!bkey_err(k))  				iter->k = *k.k;  			/* We're not returning a key from iter->path: */ -			goto out_no_locked; +			goto out;  		} -		k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); +		k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);  		if (unlikely(!k.k)) -			goto out_no_locked; +			goto out;  		if (unlikely(k.k->type == KEY_TYPE_whiteout &&  			     (iter->flags & BTREE_ITER_filter_snapshots) && @@ -2830,7 +2841,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre  		}  		if (unlikely(bkey_err(k))) -			goto out_no_locked; +			goto out;  		next = k.k ? 
bkey_start_pos(k.k) : POS_MAX; @@ -2852,8 +2863,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre  		}  	}  out: -	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); -out_no_locked:  	bch2_btree_iter_verify_entry_exit(iter);  	bch2_btree_iter_verify(trans, iter);  	ret = bch2_btree_iter_verify_ret(trans, iter, k); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index d1ad1a7613c9..ade3b5addd75 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -288,7 +288,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,  			.size			= max_t(size_t, keys->size, 8) * 2,  		}; -		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); +		new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL);  		if (!new_keys.data) {  			bch_err(c, "%s: error allocating new key array (size %zu)",  				__func__, new_keys.size); @@ -644,8 +644,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,   */  static int journal_sort_key_cmp(const void *_l, const void *_r)  { -	cond_resched(); -  	const struct journal_key *l = _l;  	const struct journal_key *r = _r; @@ -689,7 +687,8 @@ void bch2_journal_keys_put(struct bch_fs *c)  static void __journal_keys_sort(struct journal_keys *keys)  { -	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); +	sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), +		       journal_sort_key_cmp, NULL);  	cond_resched(); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 8c9fdb7263fe..86acf037590c 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -183,7 +183,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,  		return;  	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { -		if (!c->chacha20) +		if 
(!c->chacha20_key_set)  			return;  		struct nonce nonce = btree_nonce(&bn->keys, 0); @@ -398,7 +398,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)  		bch2_print_string_as_lines(KERN_INFO, buf.buf);  	} -	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); +	sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);  	dst = 0;  	darray_for_each(f->nodes, i) { @@ -418,7 +418,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)  	}  	f->nodes.nr = dst; -	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); +	sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);  	if (0 && c->opts.verbose) {  		printbuf_reset(&buf); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 55fbeeb8eaaa..00307356d7c8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1221,7 +1221,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,  	ret = bch2_disk_reservation_get(c, &as->disk_res,  			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), -			c->opts.metadata_replicas, +			READ_ONCE(c->opts.metadata_replicas),  			disk_res_flags);  	if (ret)  		goto err; @@ -1389,7 +1389,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,  	printbuf_exit(&buf);  } -static void +static int  bch2_btree_insert_keys_interior(struct btree_update *as,  				struct btree_trans *trans,  				struct btree_path *path, @@ -1411,7 +1411,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as,  	     insert = bkey_next(insert))  		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); -	if (bch2_btree_node_check_topology(trans, b)) { +	int ret = bch2_btree_node_check_topology(trans, b); +	if (ret) {  		struct printbuf buf = PRINTBUF;  		for (struct bkey_i *k = keys->keys; @@ -1421,11 +1422,15 @@ 
bch2_btree_insert_keys_interior(struct btree_update *as,  			prt_newline(&buf);  		} -		panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); +		bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s", +				    (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf); +		dump_stack(); +		return ret;  	}  	memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);  	keys->top_p -= insert->_data - keys->keys_p; +	return 0;  }  static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) @@ -1559,11 +1564,11 @@ static void __btree_split_node(struct btree_update *as,   * nodes that were coalesced, and thus in the middle of a child node post   * coalescing:   */ -static void btree_split_insert_keys(struct btree_update *as, -				    struct btree_trans *trans, -				    btree_path_idx_t path_idx, -				    struct btree *b, -				    struct keylist *keys) +static int btree_split_insert_keys(struct btree_update *as, +				   struct btree_trans *trans, +				   btree_path_idx_t path_idx, +				   struct btree *b, +				   struct keylist *keys)  {  	struct btree_path *path = trans->paths + path_idx; @@ -1573,8 +1578,12 @@ static void btree_split_insert_keys(struct btree_update *as,  		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); -		bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); +		int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); +		if (ret) +			return ret;  	} + +	return 0;  }  static int btree_split(struct btree_update *as, struct btree_trans *trans, @@ -1607,8 +1616,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,  		__btree_split_node(as, trans, b, n, keys);  		if (keys) { -			btree_split_insert_keys(as, trans, path, n1, keys); -			btree_split_insert_keys(as, trans, path, n2, keys); +			ret =   btree_split_insert_keys(as, trans, path, n1, keys) ?: +				btree_split_insert_keys(as, trans, 
path, n2, keys); +			if (ret) +				goto err;  			BUG_ON(!bch2_keylist_empty(keys));  		} @@ -1654,7 +1665,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,  			n3->sib_u64s[0] = U16_MAX;  			n3->sib_u64s[1] = U16_MAX; -			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); +			ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); +			if (ret) +				goto err;  		}  	} else {  		trace_and_count(c, btree_node_compact, trans, b); @@ -1662,7 +1675,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,  		n1 = bch2_btree_node_alloc_replacement(as, trans, b);  		if (keys) { -			btree_split_insert_keys(as, trans, path, n1, keys); +			ret = btree_split_insert_keys(as, trans, path, n1, keys); +			if (ret) +				goto err;  			BUG_ON(!bch2_keylist_empty(keys));  		} @@ -1809,15 +1824,15 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t  		goto split;  	} -	ret = bch2_btree_node_check_topology(trans, b); + +	ret =   bch2_btree_node_check_topology(trans, b) ?: +		bch2_btree_insert_keys_interior(as, trans, path, b, +					path->l[b->c.level].iter, keys);  	if (ret) {  		bch2_btree_node_unlock_write(trans, path, b);  		return ret;  	} -	bch2_btree_insert_keys_interior(as, trans, path, b, -					path->l[b->c.level].iter, keys); -  	trans_for_each_path_with_node(trans, b, linked, i)  		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index adbe576ec77e..0941fb2c026d 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -428,10 +428,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)  		 */  		trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); -		sort(wb->flushing.keys.data, -		     wb->flushing.keys.nr, -		     sizeof(wb->flushing.keys.data[0]), -		     wb_key_seq_cmp, NULL); +		
sort_nonatomic(wb->flushing.keys.data, +			       wb->flushing.keys.nr, +			       sizeof(wb->flushing.keys.data[0]), +			       wb_key_seq_cmp, NULL);  		darray_for_each(wb->flushing.keys, i) {  			if (!i->journal_seq) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index fea61e60a9ee..31fbc2716d8b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -37,7 +37,8 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)  void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)  {  	memset(usage, 0, sizeof(*usage)); -	acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); +	acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, +			sizeof(struct bch_dev_usage_full) / sizeof(u64));  }  static u64 reserve_factor(u64 r) @@ -603,6 +604,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans,  	}  	struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); +	if (!bucket_valid(ca, bucket.offset)) { +		if (insert) { +			bch2_dev_bucket_missing(ca, bucket.offset); +			ret = -BCH_ERR_trigger_pointer; +		} +		goto err; +	}  	if (flags & BTREE_TRIGGER_transactional) {  		struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); @@ -1306,13 +1314,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)  	old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);  	if (resize) { -		bucket_gens->nbuckets = min(bucket_gens->nbuckets, -					    old_bucket_gens->nbuckets); -		bucket_gens->nbuckets_minus_first = -			bucket_gens->nbuckets - bucket_gens->first_bucket; +		u64 copy = min(bucket_gens->nbuckets, +			       old_bucket_gens->nbuckets);  		memcpy(bucket_gens->b,  		       old_bucket_gens->b, -		       bucket_gens->nbuckets); +		       sizeof(bucket_gens->b[0]) * copy);  	}  	rcu_assign_pointer(ca->bucket_gens, bucket_gens); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 1c38b165f48b..af1532de4a37 
100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -44,6 +44,7 @@ static inline void bucket_unlock(struct bucket *b)  	BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);  	clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); +	smp_mb__after_atomic();  	wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);  } @@ -242,11 +243,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,  /* Filesystem usage: */ -static inline unsigned dev_usage_u64s(void) -{ -	return sizeof(struct bch_dev_usage) / sizeof(u64); -} -  struct bch_fs_usage_short  bch2_fs_usage_read_short(struct bch_fs *); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3726689093e3..d0a34a097b80 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -7,17 +7,12 @@  #include "super-io.h"  #include <linux/crc32c.h> -#include <linux/crypto.h>  #include <linux/xxhash.h>  #include <linux/key.h>  #include <linux/random.h>  #include <linux/ratelimit.h> -#include <linux/scatterlist.h> -#include <crypto/algapi.h>  #include <crypto/chacha.h> -#include <crypto/hash.h>  #include <crypto/poly1305.h> -#include <crypto/skcipher.h>  #include <keys/user-type.h>  /* @@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *  	}  } -static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, -				struct nonce nonce, -				struct scatterlist *sg, size_t len) +static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS], +			       const struct bch_key *key, struct nonce nonce)  { -	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); +	u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; -	skcipher_request_set_sync_tfm(req, tfm); -	skcipher_request_set_callback(req, 0, NULL, NULL); -	skcipher_request_set_crypt(req, sg, sg, len, nonce.d); +	BUILD_BUG_ON(sizeof(key_words) != sizeof(*key)); +	memcpy(key_words, key, sizeof(key_words)); +	le32_to_cpu_array(key_words, ARRAY_SIZE(key_words)); -	int ret = 
crypto_skcipher_encrypt(req); -	if (ret) -		pr_err("got error %i from crypto_skcipher_encrypt()", ret); - -	return ret; -} - -static inline int do_encrypt(struct crypto_sync_skcipher *tfm, -			      struct nonce nonce, -			      void *buf, size_t len) -{ -	if (!is_vmalloc_addr(buf)) { -		struct scatterlist sg = {}; - -		sg_mark_end(&sg); -		sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf)); -		return do_encrypt_sg(tfm, nonce, &sg, len); -	} else { -		DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; -		size_t sgl_len = 0; -		int ret; - -		darray_init(&sgl); - -		while (len) { -			unsigned offset = offset_in_page(buf); -			struct scatterlist sg = { -				.page_link	= (unsigned long) vmalloc_to_page(buf), -				.offset		= offset, -				.length		= min(len, PAGE_SIZE - offset), -			}; +	BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE); +	chacha_init(state, key_words, (const u8 *)nonce.d); -			if (darray_push(&sgl, sg)) { -				sg_mark_end(&darray_last(sgl)); -				ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); -				if (ret) -					goto err; - -				nonce = nonce_add(nonce, sgl_len); -				sgl_len = 0; -				sgl.nr = 0; -				BUG_ON(darray_push(&sgl, sg)); -			} - -			buf += sg.length; -			len -= sg.length; -			sgl_len += sg.length; -		} - -		sg_mark_end(&darray_last(sgl)); -		ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); -err: -		darray_exit(&sgl); -		return ret; -	} +	memzero_explicit(key_words, sizeof(key_words));  } -int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, -			    void *buf, size_t len) +static void bch2_chacha20(const struct bch_key *key, struct nonce nonce, +			  void *data, size_t len)  { -	struct crypto_sync_skcipher *chacha20 = -		crypto_alloc_sync_skcipher("chacha20", 0, 0); -	int ret; - -	ret = PTR_ERR_OR_ZERO(chacha20); -	if (ret) { -		pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); -		return ret; -	} - -	ret = crypto_skcipher_setkey(&chacha20->base, -				     (void *) key, sizeof(*key)); -	if (ret) { -	
	pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); -		goto err; -	} +	u32 state[CHACHA_STATE_WORDS]; -	ret = do_encrypt(chacha20, nonce, buf, len); -err: -	crypto_free_sync_skcipher(chacha20); -	return ret; +	bch2_chacha20_init(state, key, nonce); +	chacha20_crypt(state, data, data, len); +	memzero_explicit(state, sizeof(state));  } -static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -			struct nonce nonce) +static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, +			       struct bch_fs *c, struct nonce nonce)  { -	u8 key[POLY1305_KEY_SIZE]; -	int ret; +	u8 key[POLY1305_KEY_SIZE] = { 0 };  	nonce.d[3] ^= BCH_NONCE_POLY; -	memset(key, 0, sizeof(key)); -	ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); -	if (ret) -		return ret; - -	desc->tfm = c->poly1305; -	crypto_shash_init(desc); -	crypto_shash_update(desc, key, sizeof(key)); -	return 0; +	bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key)); +	poly1305_init(desc, key);  }  struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, @@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,  	case BCH_CSUM_chacha20_poly1305_80:  	case BCH_CSUM_chacha20_poly1305_128: { -		SHASH_DESC_ON_STACK(desc, c->poly1305); +		struct poly1305_desc_ctx dctx;  		u8 digest[POLY1305_DIGEST_SIZE];  		struct bch_csum ret = { 0 }; -		gen_poly_key(c, desc, nonce); - -		crypto_shash_update(desc, data, len); -		crypto_shash_final(desc, digest); +		bch2_poly1305_init(&dctx, c, nonce); +		poly1305_update(&dctx, data, len); +		poly1305_final(&dctx, digest);  		memcpy(&ret, digest, bch_crc_bytes[type]);  		return ret; @@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,  	if (!bch2_csum_type_is_encryption(type))  		return 0; -	if (bch2_fs_inconsistent_on(!c->chacha20, +	if (bch2_fs_inconsistent_on(!c->chacha20_key_set,  				    c, "attempting to encrypt without encryption key"))  		return -BCH_ERR_no_encryption_key; -	return 
do_encrypt(c->chacha20, nonce, data, len); +	bch2_chacha20(&c->chacha20_key, nonce, data, len); +	return 0;  }  static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,  	case BCH_CSUM_chacha20_poly1305_80:  	case BCH_CSUM_chacha20_poly1305_128: { -		SHASH_DESC_ON_STACK(desc, c->poly1305); +		struct poly1305_desc_ctx dctx;  		u8 digest[POLY1305_DIGEST_SIZE];  		struct bch_csum ret = { 0 }; -		gen_poly_key(c, desc, nonce); +		bch2_poly1305_init(&dctx, c, nonce);  #ifdef CONFIG_HIGHMEM  		__bio_for_each_segment(bv, bio, *iter, *iter) {  			void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; -			crypto_shash_update(desc, p, bv.bv_len); +			poly1305_update(&dctx, p, bv.bv_len);  			kunmap_local(p);  		}  #else  		__bio_for_each_bvec(bv, bio, *iter, *iter) -			crypto_shash_update(desc, +			poly1305_update(&dctx,  				page_address(bv.bv_page) + bv.bv_offset,  				bv.bv_len);  #endif -		crypto_shash_final(desc, digest); +		poly1305_final(&dctx, digest);  		memcpy(&ret, digest, bch_crc_bytes[type]);  		return ret; @@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,  {  	struct bio_vec bv;  	struct bvec_iter iter; -	DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; -	size_t sgl_len = 0; +	u32 chacha_state[CHACHA_STATE_WORDS];  	int ret = 0; -	if (bch2_fs_inconsistent_on(!c->chacha20, +	if (bch2_fs_inconsistent_on(!c->chacha20_key_set,  				    c, "attempting to encrypt without encryption key"))  		return -BCH_ERR_no_encryption_key; -	darray_init(&sgl); +	bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce);  	bio_for_each_segment(bv, bio, iter) { -		struct scatterlist sg = { -			.page_link	= (unsigned long) bv.bv_page, -			.offset		= bv.bv_offset, -			.length		= bv.bv_len, -		}; - -		if (darray_push(&sgl, sg)) { -			sg_mark_end(&darray_last(sgl)); -			ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); -			if (ret) -				
goto err; - -			nonce = nonce_add(nonce, sgl_len); -			sgl_len = 0; -			sgl.nr = 0; - -			BUG_ON(darray_push(&sgl, sg)); +		void *p; + +		/* +		 * chacha_crypt() assumes that the length is a multiple of +		 * CHACHA_BLOCK_SIZE on any non-final call. +		 */ +		if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) { +			bch_err_ratelimited(c, "bio not aligned for encryption"); +			ret = -EIO; +			break;  		} -		sgl_len += sg.length; +		p = bvec_kmap_local(&bv); +		chacha20_crypt(chacha_state, p, p, bv.bv_len); +		kunmap_local(p);  	} - -	sg_mark_end(&darray_last(sgl)); -	ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); -err: -	darray_exit(&sgl); +	memzero_explicit(chacha_state, sizeof(chacha_state));  	return ret;  } @@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,  	}  	/* decrypt real key: */ -	ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -				      &sb_key, sizeof(sb_key)); -	if (ret) -		goto err; +	bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));  	if (bch2_key_is_encrypted(&sb_key)) {  		bch_err(c, "incorrect encryption key"); @@ -668,31 +574,6 @@ err:  	return ret;  } -static int bch2_alloc_ciphers(struct bch_fs *c) -{ -	if (c->chacha20) -		return 0; - -	struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -	int ret = PTR_ERR_OR_ZERO(chacha20); -	if (ret) { -		bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); -		return ret; -	} - -	struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0); -	ret = PTR_ERR_OR_ZERO(poly1305); -	if (ret) { -		bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); -		crypto_free_sync_skcipher(chacha20); -		return ret; -	} - -	c->chacha20	= chacha20; -	c->poly1305	= poly1305; -	return 0; -} -  #if 0  /* @@ -797,35 +678,21 @@ err:  void bch2_fs_encryption_exit(struct bch_fs *c)  { -	if (c->poly1305) -		crypto_free_shash(c->poly1305); -	if (c->chacha20) -		crypto_free_sync_skcipher(c->chacha20); 
+	memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));  }  int bch2_fs_encryption_init(struct bch_fs *c)  {  	struct bch_sb_field_crypt *crypt; -	struct bch_key key; -	int ret = 0; +	int ret;  	crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);  	if (!crypt) -		goto out; +		return 0; -	ret = bch2_alloc_ciphers(c); +	ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);  	if (ret) -		goto out; - -	ret = bch2_decrypt_sb_key(c, crypt, &key); -	if (ret) -		goto out; - -	ret = crypto_skcipher_setkey(&c->chacha20->base, -			(void *) &key.key, sizeof(key.key)); -	if (ret) -		goto out; -out: -	memzero_explicit(&key, sizeof(key)); -	return ret; +		return ret; +	c->chacha20_key_set = true; +	return 0;  } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 4ac251c8fcd8..1310782d3ae9 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -69,7 +69,6 @@ static inline void bch2_csum_err_msg(struct printbuf *out,  	bch2_csum_to_text(out, type, expected);  } -int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);  int bch2_request_key(struct bch_sb *, struct bch_key *);  #ifndef __KERNEL__  int bch2_revoke_key(struct bch_sb *); @@ -156,7 +155,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,  	if (type >= BCH_CSUM_NR)  		return false; -	if (bch2_csum_type_is_encryption(type) && !c->chacha20) +	if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)  		return false;  	return true; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index de02ebf847ec..b211c97238ab 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -607,7 +607,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update  	prt_newline(out);  	printbuf_indent_add(out, 2);  	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); -	prt_printf(out, "read_done:\t\%u\n", m->read_done); +	prt_printf(out, "read_done:\t%u\n", m->read_done);  	
bch2_write_op_to_text(out, &m->op);  	printbuf_indent_sub(out, 2);  } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index bf53a029f356..8a680e52c1ed 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,8 +13,8 @@  #include <linux/dcache.h> -static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, -				const struct qstr *str, struct qstr *out_cf) +int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, +		  const struct qstr *str, struct qstr *out_cf)  {  	*out_cf = (struct qstr) QSTR_INIT(NULL, 0); @@ -35,18 +35,6 @@ static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *  #endif  } -static inline int bch2_maybe_casefold(struct btree_trans *trans, -				      const struct bch_hash_info *info, -				      const struct qstr *str, struct qstr *out_cf) -{ -	if (likely(!info->cf_encoding)) { -		*out_cf = *str; -		return 0; -	} else { -		return bch2_casefold(trans, info, str, out_cf); -	} -} -  static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)  {  	if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) @@ -287,8 +275,8 @@ static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,  	EBUG_ON(!dirent->v.d_casefold);  	EBUG_ON(!cf_name->len); -	dirent->v.d_cf_name_block.d_name_len = name->len; -	dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len; +	dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); +	dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);  	memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);  	memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);  	memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, @@ -697,7 +685,7 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv  		      vfs_d_type(d.v->d_type));  	if (ret)  		ctx->pos = d.k->p.offset + 1; -	return ret; +	return !ret;  }  int 
bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) @@ -722,7 +710,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)  			if (ret2 > 0)  				continue; -			ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); +			ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target));  		})));  	bch2_bkey_buf_exit(&sk, c); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 0880772b80a9..9838a7ba7ed1 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -23,6 +23,21 @@ struct bch_fs;  struct bch_hash_info;  struct bch_inode_info; +int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, +		  const struct qstr *, struct qstr *); + +static inline int bch2_maybe_casefold(struct btree_trans *trans, +				      const struct bch_hash_info *info, +				      const struct qstr *str, struct qstr *out_cf) +{ +	if (likely(!info->cf_encoding)) { +		*out_cf = *str; +		return 0; +	} else { +		return bch2_casefold(trans, info, str, out_cf); +	} +} +  struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);  static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b007319b72e9..1f0422bfae35 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -376,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,  	return ret;  } +int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, +			       enum bch_accounting_mode mode) +{ +	struct bch_replicas_padded r; + +	if (mode != BCH_ACCOUNTING_read && +	    accounting_to_replicas(&r.e, a.k->p) && +	    !bch2_replicas_marked_locked(c, &r.e)) +		return -BCH_ERR_btree_insert_need_mark_replicas; + +	return __bch2_accounting_mem_insert(c, a); +} +  static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)  {  	for (unsigned i = 0; i < e->nr_counters; 
i++) @@ -583,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)  					accounting_key_init(&k_i.k, &acc_k, src_v, nr);  					bch2_accounting_mem_mod_locked(trans,  								bkey_i_to_s_c_accounting(&k_i.k), -								BCH_ACCOUNTING_normal); +								BCH_ACCOUNTING_normal, true);  					preempt_disable();  					struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -612,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)  	percpu_down_read(&c->mark_lock);  	int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), -						 BCH_ACCOUNTING_read); +						 BCH_ACCOUNTING_read, false);  	percpu_up_read(&c->mark_lock);  	return ret;  } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index abb1f6206fe9..d557b99b3c0a 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -136,6 +136,7 @@ enum bch_accounting_mode {  };  int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); +int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);  void bch2_accounting_mem_gc(struct bch_fs *);  static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) @@ -150,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)   */  static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,  						 struct bkey_s_c_accounting a, -						 enum bch_accounting_mode mode) +						 enum bch_accounting_mode mode, +						 bool write_locked)  {  	struct bch_fs *c = trans->c;  	struct bch_accounting_mem *acc = &c->accounting; @@ -189,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,  	while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),  				      accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { -		int ret = bch2_accounting_mem_insert(c, a, mode); +		int ret = 0; +		if 
(unlikely(write_locked)) +			ret = bch2_accounting_mem_insert_locked(c, a, mode); +		else +			ret = bch2_accounting_mem_insert(c, a, mode);  		if (ret)  			return ret;  	} @@ -206,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,  static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)  {  	percpu_down_read(&trans->c->mark_lock); -	int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); +	int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);  	percpu_up_read(&trans->c->mark_lock);  	return ret;  } @@ -259,7 +265,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,  	EBUG_ON(bversion_zero(a->k.bversion));  	return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) -		? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) +		? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)  		: 0;  } @@ -271,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans  		struct bkey_s_accounting a = accounting_i_to_s(a_i);  		bch2_accounting_neg(a); -		bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); +		bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);  		bch2_accounting_neg(a);  	}  } diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1186280b29e9..2ca3cbf12b71 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -470,23 +470,22 @@ inval:  int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)  { -	struct bch_member *mi; -	int ret, v = -1; +	lockdep_assert_held(&c->sb_lock); -	if (!strlen(name) || !strcmp(name, "none")) -		return 0; -	v = bch2_disk_path_find_or_create(&c->disk_sb, name); -	if (v < 0) -		return v; +	if (!strlen(name) || 
!strcmp(name, "none")) { +		struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); +		SET_BCH_MEMBER_GROUP(mi, 0); +	} else { +		int v = bch2_disk_path_find_or_create(&c->disk_sb, name); +		if (v < 0) +			return v; -	ret = bch2_sb_disk_groups_to_cpu(c); -	if (ret) -		return ret; +		struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); +		SET_BCH_MEMBER_GROUP(mi, v + 1); +	} -	mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); -	SET_BCH_MEMBER_GROUP(mi, v + 1); -	return 0; +	return bch2_sb_disk_groups_to_cpu(c);  }  int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a396865e8b17..fff58b78327c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2204,10 +2204,10 @@ void bch2_fs_ec_stop(struct bch_fs *c)  static bool bch2_fs_ec_flush_done(struct bch_fs *c)  { -	bool ret; +	sched_annotate_sleep();  	mutex_lock(&c->ec_stripe_new_lock); -	ret = list_empty(&c->ec_stripe_new_list); +	bool ret = list_empty(&c->ec_stripe_new_list);  	mutex_unlock(&c->ec_stripe_new_lock);  	return ret; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 62d27e04d763..51893e1ee874 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -160,6 +160,7 @@ static inline void gc_stripe_unlock(struct gc_stripe *s)  	BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);  	clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); +	smp_mb__after_atomic();  	wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);  } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c8696f01eb14..d9ebffa5b3a2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -269,7 +269,7 @@  	x(BCH_ERR_invalid_sb,		invalid_sb_downgrade)			\  	x(BCH_ERR_invalid,		invalid_bkey)				\  	x(BCH_ERR_operation_blocked,    nocow_lock_blocked)			\ -	x(EIO,				journal_shutdown)			\ +	x(EROFS,			journal_shutdown)			\  	x(EIO,				journal_flush_err)			\  
	x(EIO,				journal_write_err)			\  	x(EIO,				btree_node_read_err)			\ @@ -287,7 +287,7 @@  	x(EIO,				mark_stripe)				\  	x(EIO,				stripe_reconstruct)			\  	x(EIO,				key_type_error)				\ -	x(EIO,				extent_poisened)			\ +	x(EIO,				extent_poisoned)			\  	x(EIO,				missing_indirect_extent)		\  	x(EIO,				invalidate_stripe_to_dev)		\  	x(EIO,				no_encryption_key)			\ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index baf5dfb32298..6b8695b1349c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -272,9 +272,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c,  {  	struct fsck_err_state *s; -	if (!test_bit(BCH_FS_fsck_running, &c->flags)) -		return NULL; -  	list_for_each_entry(s, &c->fsck_error_msgs, list)  		if (s->id == id) {  			/* @@ -481,7 +478,9 @@ int __bch2_fsck_err(struct bch_fs *c,  	} else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {  		if (c->opts.errors != BCH_ON_ERROR_continue ||  		    !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { -			prt_str(out, ", shutting down"); +			prt_str_indented(out, ", shutting down\n" +					 "error not marked as autofix and not in fsck\n" +					 "run fsck, and forward to devs so error can be marked for self-healing");  			inconsistent = true;  			print = true;  			ret = -BCH_ERR_fsck_errors_not_fixed; @@ -639,14 +638,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,  	return ret;  } -void bch2_flush_fsck_errs(struct bch_fs *c) +static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)  {  	struct fsck_err_state *s, *n;  	mutex_lock(&c->fsck_error_msgs_lock);  	list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { -		if (s->ratelimited && s->last_msg) +		if (print && s->ratelimited && s->last_msg)  			bch_err(c, "Saw %llu errors like:\n  %s", s->nr, s->last_msg);  		list_del(&s->list); @@ -657,6 +656,16 @@ void bch2_flush_fsck_errs(struct bch_fs *c)  	mutex_unlock(&c->fsck_error_msgs_lock);  } +void bch2_flush_fsck_errs(struct bch_fs *c) +{ +	__bch2_flush_fsck_errs(c, true); 
+} + +void bch2_free_fsck_errs(struct bch_fs *c) +{ +	__bch2_flush_fsck_errs(c, false); +} +  int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,  				    subvol_inum inum, u64 offset)  { diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index d0d024dc714b..4a364fd44abe 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -93,6 +93,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,  			_flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)  void bch2_flush_fsck_errs(struct bch_fs *); +void bch2_free_fsck_errs(struct bch_fs *);  #define fsck_err_wrap(_do)						\  ({									\ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ae7c7a177e10..e597fb9c9823 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -139,7 +139,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,  	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);  	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) -		return -BCH_ERR_extent_poisened; +		return -BCH_ERR_extent_poisoned;  	rcu_read_lock();  	const union bch_extent_entry *entry; @@ -1056,8 +1056,9 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke  static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,  			    struct bch_extent_ptr *ptr)  { -	if (!opts->promote_target || -	    !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) +	unsigned target = opts->promote_target ?: opts->foreground_target; + +	if (target && !bch2_dev_in_target(c, ptr->dev, target))  		return false;  	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 19d4599918dc..e3a75dcca60c 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -225,11 +225,26 @@ static void bchfs_read(struct btree_trans *trans,  		bch2_read_extent(trans, rbio, iter.pos,  				 data_btree, k, offset_into_extent, flags); -		
swap(rbio->bio.bi_iter.bi_size, bytes); +		/* +		 * Careful there's a landmine here if bch2_read_extent() ever +		 * starts returning transaction restarts here. +		 * +		 * We've changed rbio->bi_iter.bi_size to be "bytes we can read +		 * from this extent" with the swap call, and we restore it +		 * below. That restore needs to come before checking for +		 * errors. +		 * +		 * But unlike __bch2_read(), we use the rbio bvec iter, not one +		 * on the stack, so we can't do the restore right after the +		 * bch2_read_extent() call: we don't own that iterator anymore +		 * if BCH_READ_last_fragment is set, since we may have submitted +		 * that rbio instead of cloning it. +		 */  		if (flags & BCH_READ_last_fragment)  			break; +		swap(rbio->bio.bi_iter.bi_size, bytes);  		bio_advance(&rbio->bio, bytes);  err:  		if (ret && diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 65c2c33d253d..9657144666b8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -144,10 +144,25 @@ int __must_check bch2_write_inode_size(struct bch_fs *c,  void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,  			   struct quota_res *quota_res, s64 sectors)  { -	bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, -				"inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", -				inode->v.i_ino, (u64) inode->v.i_blocks, sectors, -				inode->ei_inode.bi_sectors); +	if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { +		struct printbuf buf = PRINTBUF; +		bch2_log_msg_start(c, &buf); +		prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", +			   inode->v.i_ino, (u64) inode->v.i_blocks, sectors, +			   inode->ei_inode.bi_sectors); + +		bool repeat = false, print = false, suppress = false; +		bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, buf.buf, &repeat, &print, &suppress); +		if (print) +			bch2_print_str(c, buf.buf); +		printbuf_exit(&buf); + +		if (sectors < 0) +			sectors = -inode->v.i_blocks; +		else 
+			sectors = 0; +	} +  	inode->v.i_blocks += sectors;  #ifdef CONFIG_BCACHEFS_QUOTA @@ -502,11 +517,22 @@ int bchfs_truncate(struct mnt_idmap *idmap,  		goto err;  	} -	bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && -				!bch2_journal_error(&c->journal), c, -				"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", -				inode->v.i_ino, (u64) inode->v.i_blocks, -				inode->ei_inode.bi_sectors); +	if (unlikely(!inode->v.i_size && inode->v.i_blocks && +		     !bch2_journal_error(&c->journal))) { +		struct printbuf buf = PRINTBUF; +		bch2_log_msg_start(c, &buf); +		prt_printf(&buf, +			   "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", +			   inode->v.i_ino, (u64) inode->v.i_blocks, +			   inode->ei_inode.bi_sectors); + +		bool repeat = false, print = false, suppress = false; +		bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, buf.buf, +				    &repeat, &print, &suppress); +		if (print) +			bch2_print_str(c, buf.buf); +		printbuf_exit(&buf); +	}  	ret = bch2_setattr_nonsize(idmap, inode, iattr);  err: diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index c1553e44e049..a82dfce9e4ad 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -21,206 +21,6 @@  #define FSOP_GOING_FLAGS_LOGFLUSH	0x1	/* flush log but not data */  #define FSOP_GOING_FLAGS_NOLOGFLUSH	0x2	/* don't flush log nor data */ -struct flags_set { -	unsigned		mask; -	unsigned		flags; - -	unsigned		projid; - -	bool			set_projinherit; -	bool			projinherit; -}; - -static int bch2_inode_flags_set(struct btree_trans *trans, -				struct bch_inode_info *inode, -				struct bch_inode_unpacked *bi, -				void *p) -{ -	struct bch_fs *c = inode->v.i_sb->s_fs_info; -	/* -	 * We're relying on btree locking here for exclusion with other ioctl -	 * calls - use the flags in the btree (@bi), not inode->i_flags: -	 */ -	struct flags_set *s = p; -	unsigned newflags = s->flags; -	unsigned oldflags = bi->bi_flags & s->mask; - -	if (((newflags ^ oldflags) 
& (BCH_INODE_append|BCH_INODE_immutable)) && -	    !capable(CAP_LINUX_IMMUTABLE)) -		return -EPERM; - -	if (!S_ISREG(bi->bi_mode) && -	    !S_ISDIR(bi->bi_mode) && -	    (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) -		return -EINVAL; - -	if ((newflags ^ oldflags) & BCH_INODE_casefolded) { -#ifdef CONFIG_UNICODE -		int ret = 0; -		/* Not supported on individual files. */ -		if (!S_ISDIR(bi->bi_mode)) -			return -EOPNOTSUPP; - -		/* -		 * Make sure the dir is empty, as otherwise we'd need to -		 * rehash everything and update the dirent keys. -		 */ -		ret = bch2_empty_dir_trans(trans, inode_inum(inode)); -		if (ret < 0) -			return ret; - -		ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding); -		if (ret) -			return ret; - -		bch2_check_set_feature(c, BCH_FEATURE_casefolding); -#else -		printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); -		return -EOPNOTSUPP; -#endif -	} - -	if (s->set_projinherit) { -		bi->bi_fields_set &= ~(1 << Inode_opt_project); -		bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); -	} - -	bi->bi_flags &= ~s->mask; -	bi->bi_flags |= newflags; - -	bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); -	return 0; -} - -static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) -{ -	unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); - -	return put_user(flags, arg); -} - -static int bch2_ioc_setflags(struct bch_fs *c, -			     struct file *file, -			     struct bch_inode_info *inode, -			     void __user *arg) -{ -	struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; -	unsigned uflags; -	int ret; - -	if (get_user(uflags, (int __user *) arg)) -		return -EFAULT; - -	s.flags = map_flags_rev(bch_flags_to_uflags, uflags); -	if (uflags) -		return -EOPNOTSUPP; - -	ret = mnt_want_write_file(file); -	if (ret) -		return ret; - -	inode_lock(&inode->v); -	if (!inode_owner_or_capable(file_mnt_idmap(file), 
&inode->v)) { -		ret = -EACCES; -		goto setflags_out; -	} - -	mutex_lock(&inode->ei_update_lock); -	ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: -		bch2_write_inode(c, inode, bch2_inode_flags_set, &s, -			       ATTR_CTIME); -	mutex_unlock(&inode->ei_update_lock); - -setflags_out: -	inode_unlock(&inode->v); -	mnt_drop_write_file(file); -	return ret; -} - -static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, -			       struct fsxattr __user *arg) -{ -	struct fsxattr fa = { 0 }; - -	fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); - -	if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) -		fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; - -	fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - -	if (copy_to_user(arg, &fa, sizeof(fa))) -		return -EFAULT; - -	return 0; -} - -static int fssetxattr_inode_update_fn(struct btree_trans *trans, -				      struct bch_inode_info *inode, -				      struct bch_inode_unpacked *bi, -				      void *p) -{ -	struct flags_set *s = p; - -	if (s->projid != bi->bi_project) { -		bi->bi_fields_set |= 1U << Inode_opt_project; -		bi->bi_project = s->projid; -	} - -	return bch2_inode_flags_set(trans, inode, bi, p); -} - -static int bch2_ioc_fssetxattr(struct bch_fs *c, -			       struct file *file, -			       struct bch_inode_info *inode, -			       struct fsxattr __user *arg) -{ -	struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; -	struct fsxattr fa; -	int ret; - -	if (copy_from_user(&fa, arg, sizeof(fa))) -		return -EFAULT; - -	s.set_projinherit = true; -	s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; -	fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; - -	s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); -	if (fa.fsx_xflags) -		return -EOPNOTSUPP; - -	if (fa.fsx_projid >= U32_MAX) -		return -EINVAL; - -	/* -	 * inode fields accessible via the xattr interface are stored with a +1 -	 * bias, so that 0 means unset: -	 */ -	s.projid = fa.fsx_projid + 1; - -	ret = 
mnt_want_write_file(file); -	if (ret) -		return ret; - -	inode_lock(&inode->v); -	if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { -		ret = -EACCES; -		goto err; -	} - -	mutex_lock(&inode->ei_update_lock); -	ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: -		bch2_set_projid(c, inode, fa.fsx_projid) ?: -		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, -			       ATTR_CTIME); -	mutex_unlock(&inode->ei_update_lock); -err: -	inode_unlock(&inode->v); -	mnt_drop_write_file(file); -	return ret; -} -  static int bch2_reinherit_attrs_fn(struct btree_trans *trans,  				   struct bch_inode_info *inode,  				   struct bch_inode_unpacked *bi, @@ -558,23 +358,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)  	long ret;  	switch (cmd) { -	case FS_IOC_GETFLAGS: -		ret = bch2_ioc_getflags(inode, (int __user *) arg); -		break; - -	case FS_IOC_SETFLAGS: -		ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); -		break; - -	case FS_IOC_FSGETXATTR: -		ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); -		break; - -	case FS_IOC_FSSETXATTR: -		ret = bch2_ioc_fssetxattr(c, file, inode, -					  (void __user *) arg); -		break; -  	case BCHFS_IOC_REINHERIT_ATTRS:  		ret = bch2_ioc_reinherit_attrs(c, file, inode,  					       (void __user *) arg); diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index ecd3bfdcde21..a657e4994b71 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -2,81 +2,6 @@  #ifndef _BCACHEFS_FS_IOCTL_H  #define _BCACHEFS_FS_IOCTL_H -/* Inode flags: */ - -/* bcachefs inode flags -> vfs inode flags: */ -static const __maybe_unused unsigned bch_flags_to_vfs[] = { -	[__BCH_INODE_sync]		= S_SYNC, -	[__BCH_INODE_immutable]		= S_IMMUTABLE, -	[__BCH_INODE_append]		= S_APPEND, -	[__BCH_INODE_noatime]		= S_NOATIME, -	[__BCH_INODE_casefolded]	= S_CASEFOLD, -}; - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const __maybe_unused unsigned bch_flags_to_uflags[] = { -	
[__BCH_INODE_sync]		= FS_SYNC_FL, -	[__BCH_INODE_immutable]		= FS_IMMUTABLE_FL, -	[__BCH_INODE_append]		= FS_APPEND_FL, -	[__BCH_INODE_nodump]		= FS_NODUMP_FL, -	[__BCH_INODE_noatime]		= FS_NOATIME_FL, -	[__BCH_INODE_casefolded]	= FS_CASEFOLD_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const __maybe_unused unsigned bch_flags_to_xflags[] = { -	[__BCH_INODE_sync]	= FS_XFLAG_SYNC, -	[__BCH_INODE_immutable]	= FS_XFLAG_IMMUTABLE, -	[__BCH_INODE_append]	= FS_XFLAG_APPEND, -	[__BCH_INODE_nodump]	= FS_XFLAG_NODUMP, -	[__BCH_INODE_noatime]	= FS_XFLAG_NOATIME, -	//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -}; - -#define set_flags(_map, _in, _out)					\ -do {									\ -	unsigned _i;							\ -									\ -	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\ -		if ((_in) & (1 << _i))					\ -			(_out) |= _map[_i];				\ -		else							\ -			(_out) &= ~_map[_i];				\ -} while (0) - -#define map_flags(_map, _in)						\ -({									\ -	unsigned _out = 0;						\ -									\ -	set_flags(_map, _in, _out);					\ -	_out;								\ -}) - -#define map_flags_rev(_map, _in)					\ -({									\ -	unsigned _i, _out = 0;						\ -									\ -	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\ -		if ((_in) & _map[_i]) {					\ -			(_out) |= 1 << _i;				\ -			(_in) &= ~_map[_i];				\ -		}							\ -	(_out);								\ -}) - -#define map_defined(_map)						\ -({									\ -	unsigned _in = ~0;						\ -									\ -	map_flags_rev(_map, _in);					\ -}) - -/* Set VFS inode flags from bcachefs inode: */ -static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -{ -	set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -} -  long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);  long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5a41b1a8e54f..4b742e62255b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -33,6 +33,7 @@  #include <linux/backing-dev.h>  #include 
<linux/exportfs.h>  #include <linux/fiemap.h> +#include <linux/fileattr.h>  #include <linux/fs_context.h>  #include <linux/module.h>  #include <linux/pagemap.h> @@ -51,6 +52,24 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,  				struct bch_inode_unpacked *,  				struct bch_subvolume *); +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode) +{ +	static const __maybe_unused unsigned bch_flags_to_vfs[] = { +		[__BCH_INODE_sync]		= S_SYNC, +		[__BCH_INODE_immutable]		= S_IMMUTABLE, +		[__BCH_INODE_append]		= S_APPEND, +		[__BCH_INODE_noatime]		= S_NOATIME, +	}; + +	set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); + +	if (bch2_inode_casefold(c, &inode->ei_inode)) +		inode->v.i_flags |= S_CASEFOLD; +	else +		inode->v.i_flags &= ~S_CASEFOLD; +} +  void bch2_inode_update_after_write(struct btree_trans *trans,  				   struct bch_inode_info *inode,  				   struct bch_inode_unpacked *bi, @@ -79,7 +98,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,  	inode->ei_inode		= *bi; -	bch2_inode_flags_to_vfs(inode); +	bch2_inode_flags_to_vfs(c, inode);  }  int __must_check bch2_write_inode(struct bch_fs *c, @@ -631,13 +650,18 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,  			const struct qstr *name)  {  	struct bch_fs *c = trans->c; -	struct btree_iter dirent_iter = {};  	subvol_inum inum = {};  	struct printbuf buf = PRINTBUF; +	struct qstr lookup_name; +	int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); +	if (ret) +		return ERR_PTR(ret); + +	struct btree_iter dirent_iter = {};  	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, -					     dir_hash_info, dir, name, 0); -	int ret = bkey_err(k); +					     dir_hash_info, dir, &lookup_name, 0); +	ret = bkey_err(k);  	if (ret)  		return ERR_PTR(ret); @@ -825,6 +849,9 @@ int __bch2_unlink(struct inode 
*vdir, struct dentry *dentry,  		 */  		set_nlink(&inode->v, 0);  	} + +	if (IS_CASEFOLDED(vdir)) +		d_invalidate(dentry);  err:  	bch2_trans_put(trans);  	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); @@ -1235,10 +1262,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,  	return finish_open_simple(file, 0);  } +struct bch_fiemap_extent { +	struct bkey_buf	kbuf; +	unsigned	flags; +}; +  static int bch2_fill_extent(struct bch_fs *c,  			    struct fiemap_extent_info *info, -			    struct bkey_s_c k, unsigned flags) +			    struct bch_fiemap_extent *fe)  { +	struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); +	unsigned flags = fe->flags; + +	BUG_ON(!k.k->size); +  	if (bkey_extent_is_direct_data(k.k)) {  		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);  		const union bch_extent_entry *entry; @@ -1291,110 +1328,225 @@ static int bch2_fill_extent(struct bch_fs *c,  	}  } -static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, -		       u64 start, u64 len) +/* + * Scan a range of an inode for data in pagecache. + * + * Intended to be retryable, so don't modify the output params until success is + * imminent. 
+ */ +static int +bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, +			   bool nonblock)  { -	struct bch_fs *c = vinode->i_sb->s_fs_info; -	struct bch_inode_info *ei = to_bch_ei(vinode); -	struct btree_trans *trans; -	struct btree_iter iter; -	struct bkey_s_c k; -	struct bkey_buf cur, prev; -	bool have_extent = false; -	int ret = 0; +	loff_t	dstart, dend; -	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); -	if (ret) +	dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); +	if (dstart < 0) +		return dstart; + +	if (dstart == *end) { +		*start = dstart; +		return 0; +	} + +	dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); +	if (dend < 0) +		return dend; + +	/* race */ +	BUG_ON(dstart == dend); + +	*start = dstart; +	*end = dend; +	return 0; +} + +/* + * Scan a range of pagecache that corresponds to a file mapping hole in the + * extent btree. If data is found, fake up an extent key so it looks like a + * delalloc extent to the rest of the fiemap processing code. + */ +static int +bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode, +				  u64 start, u64 end, struct bch_fiemap_extent *cur) +{ +	struct bch_fs		*c = trans->c; +	struct bkey_i_extent	*delextent; +	struct bch_extent_ptr	ptr = {}; +	loff_t			dstart = start << 9, dend = end << 9; +	int			ret; + +	/* +	 * We hold btree locks here so we cannot block on folio locks without +	 * dropping trans locks first. Run a nonblocking scan for the common +	 * case of no folios over holes and fall back on failure. +	 * +	 * Note that dropping locks like this is technically racy against +	 * writeback inserting to the extent tree, but a non-sync fiemap scan is +	 * fundamentally racy with writeback anyways. Therefore, just report the +	 * range as delalloc regardless of whether we have to cycle trans locks. 
+	 */ +	ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true); +	if (ret == -EAGAIN) +		ret = drop_locks_do(trans, +			bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false)); +	if (ret < 0)  		return ret; -	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); -	if (start + len < start) -		return -EINVAL; +	/* +	 * Create a fake extent key in the buffer. We have to add a dummy extent +	 * pointer for the fill code to add an extent entry. It's explicitly +	 * zeroed to reflect delayed allocation (i.e. phys offset 0). +	 */ +	bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); +	delextent = bkey_extent_init(cur->kbuf.k); +	delextent->k.p = POS(inode->ei_inum.inum, dend >> 9); +	delextent->k.size = (dend - dstart) >> 9; +	bch2_bkey_append_ptr(&delextent->k_i, ptr); -	start >>= 9; +	cur->flags = FIEMAP_EXTENT_DELALLOC; -	bch2_bkey_buf_init(&cur); -	bch2_bkey_buf_init(&prev); -	trans = bch2_trans_get(c); +	return 0; +} + +static int bch2_next_fiemap_extent(struct btree_trans *trans, +				   struct bch_inode_info *inode, +				   u64 start, u64 end, +				   struct bch_fiemap_extent *cur) +{ +	u32 snapshot; +	int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); +	if (ret) +		return ret; +	struct btree_iter iter;  	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, -			     POS(ei->v.i_ino, start), 0); +			     SPOS(inode->ei_inum.inum, start, snapshot), 0); -	while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -		enum btree_id data_btree = BTREE_ID_extents; +	struct bkey_s_c k = +		bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); +	ret = bkey_err(k); +	if (ret) +		goto err; -		bch2_trans_begin(trans); +	u64 pagecache_end = k.k ? 
max(start, bkey_start_offset(k.k)) : end; -		u32 snapshot; -		ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); -		if (ret) -			continue; +	ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); +	if (ret) +		goto err; -		bch2_btree_iter_set_snapshot(trans, &iter, snapshot); +	struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); -		k = bch2_btree_iter_peek_max(trans, &iter, end); -		ret = bkey_err(k); +	/* +	 * Does the pagecache or the btree take precedence? +	 * +	 * It _should_ be the pagecache, so that we correctly report delalloc +	 * extents when dirty in the pagecache (we're COW, after all). +	 * +	 * But we'd have to add per-sector writeback tracking to +	 * bch_folio_state, otherwise we report delalloc extents for clean +	 * cached data in the pagecache. +	 * +	 * We should do this, but even then fiemap won't report stable mappings: +	 * on bcachefs data moves around in the background (copygc, rebalance) +	 * and we don't provide a way for userspace to lock that out. 
+	 */ +	if (k.k && +	    bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)), +		    pagecache_start)) { +		bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); +		bch2_cut_front(iter.pos, cur->kbuf.k); +		bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k); +		cur->flags = 0; +	} else if (k.k) { +		bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k); +	} + +	if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) { +		unsigned sectors = cur->kbuf.k->k.size; +		s64 offset_into_extent = 0; +		enum btree_id data_btree = BTREE_ID_extents; +		ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, +						&cur->kbuf);  		if (ret) -			continue; +			goto err; -		if (!k.k) -			break; +		struct bkey_i *k = cur->kbuf.k; +		sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); -		if (!bkey_extent_is_data(k.k) && -		    k.k->type != KEY_TYPE_reservation) { -			bch2_btree_iter_advance(trans, &iter); -			continue; -		} +		bch2_cut_front(POS(k->k.p.inode, +				   bkey_start_offset(&k->k) + offset_into_extent), +			       k); +		bch2_key_resize(&k->k, sectors); +		k->k.p = iter.pos; +		k->k.p.offset += k->k.size; +	} +err: +	bch2_trans_iter_exit(trans, &iter); +	return ret; +} + +static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, +		       u64 start, u64 len) +{ +	struct bch_fs *c = vinode->i_sb->s_fs_info; +	struct bch_inode_info *ei = to_bch_ei(vinode); +	struct btree_trans *trans; +	struct bch_fiemap_extent cur, prev; +	int ret = 0; + +	ret = fiemap_prep(&ei->v, info, start, &len, 0); +	if (ret) +		return ret; + +	if (start + len < start) +		return -EINVAL; + +	start >>= 9; +	u64 end = (start + len) >> 9; -		s64 offset_into_extent	= iter.pos.offset - bkey_start_offset(k.k); -		unsigned sectors	= k.k->size - offset_into_extent; +	bch2_bkey_buf_init(&cur.kbuf); +	bch2_bkey_buf_init(&prev.kbuf); +	bkey_init(&prev.kbuf.k->k); -		bch2_bkey_buf_reassemble(&cur, c, k); +	trans = bch2_trans_get(c); -		ret = bch2_read_indirect_extent(trans, 
&data_btree, -					&offset_into_extent, &cur); +	while (start < end) { +		ret = lockrestart_do(trans, +			bch2_next_fiemap_extent(trans, ei, start, end, &cur));  		if (ret) -			continue; +			goto err; -		k = bkey_i_to_s_c(cur.k); -		bch2_bkey_buf_realloc(&prev, c, k.k->u64s); +		BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start); +		BUG_ON(cur.kbuf.k->k.p.offset > end); -		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); +		if (bkey_start_offset(&cur.kbuf.k->k) == end) +			break; -		bch2_cut_front(POS(k.k->p.inode, -				   bkey_start_offset(k.k) + -				   offset_into_extent), -			       cur.k); -		bch2_key_resize(&cur.k->k, sectors); -		cur.k->k.p = iter.pos; -		cur.k->k.p.offset += cur.k->k.size; +		start = cur.kbuf.k->k.p.offset; -		if (have_extent) { +		if (!bkey_deleted(&prev.kbuf.k->k)) {  			bch2_trans_unlock(trans); -			ret = bch2_fill_extent(c, info, -					bkey_i_to_s_c(prev.k), 0); +			ret = bch2_fill_extent(c, info, &prev);  			if (ret) -				break; +				goto err;  		} -		bkey_copy(prev.k, cur.k); -		have_extent = true; - -		bch2_btree_iter_set_pos(trans, &iter, -			POS(iter.pos.inode, iter.pos.offset + sectors)); +		bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k); +		prev.flags = cur.flags;  	} -	bch2_trans_iter_exit(trans, &iter); -	if (!ret && have_extent) { +	if (!bkey_deleted(&prev.kbuf.k->k)) {  		bch2_trans_unlock(trans); -		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), -				       FIEMAP_EXTENT_LAST); +		prev.flags |= FIEMAP_EXTENT_LAST; +		ret = bch2_fill_extent(c, info, &prev);  	} - +err:  	bch2_trans_put(trans); -	bch2_bkey_buf_exit(&cur, c); -	bch2_bkey_buf_exit(&prev, c); -	return ret < 0 ? ret : 0; +	bch2_bkey_buf_exit(&cur.kbuf, c); +	bch2_bkey_buf_exit(&prev.kbuf, c); + +	return bch2_err_class(ret < 0 ? 
ret : 0);  }  static const struct vm_operations_struct bch_vm_ops = { @@ -1449,6 +1601,165 @@ static int bch2_open(struct inode *vinode, struct file *file)  	return generic_file_open(vinode, file);  } +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const __maybe_unused unsigned bch_flags_to_uflags[] = { +	[__BCH_INODE_sync]		= FS_SYNC_FL, +	[__BCH_INODE_immutable]		= FS_IMMUTABLE_FL, +	[__BCH_INODE_append]		= FS_APPEND_FL, +	[__BCH_INODE_nodump]		= FS_NODUMP_FL, +	[__BCH_INODE_noatime]		= FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const __maybe_unused unsigned bch_flags_to_xflags[] = { +	[__BCH_INODE_sync]	= FS_XFLAG_SYNC, +	[__BCH_INODE_immutable]	= FS_XFLAG_IMMUTABLE, +	[__BCH_INODE_append]	= FS_XFLAG_APPEND, +	[__BCH_INODE_nodump]	= FS_XFLAG_NODUMP, +	[__BCH_INODE_noatime]	= FS_XFLAG_NOATIME, +}; + +static int bch2_fileattr_get(struct dentry *dentry, +			     struct fileattr *fa) +{ +	struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); +	struct bch_fs *c = inode->v.i_sb->s_fs_info; + +	fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); + +	if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) +		fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; + +	if (bch2_inode_casefold(c, &inode->ei_inode)) +		fa->flags |= FS_CASEFOLD_FL; + +	fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; +	return 0; +} + +struct flags_set { +	unsigned		mask; +	unsigned		flags; +	unsigned		projid; +	bool			set_project; +	bool			set_casefold; +	bool			casefold; +}; + +static int fssetxattr_inode_update_fn(struct btree_trans *trans, +				      struct bch_inode_info *inode, +				      struct bch_inode_unpacked *bi, +				      void *p) +{ +	struct bch_fs *c = trans->c; +	struct flags_set *s = p; + +	/* +	 * We're relying on btree locking here for exclusion with other ioctl +	 * calls - use the flags in the btree (@bi), not inode->i_flags: +	 */ +	if (!S_ISREG(bi->bi_mode) && +	    !S_ISDIR(bi->bi_mode) && +	    
(s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags) +		return -EINVAL; + +	if (s->casefold != bch2_inode_casefold(c, bi)) { +#ifdef CONFIG_UNICODE +		int ret = 0; +		/* Not supported on individual files. */ +		if (!S_ISDIR(bi->bi_mode)) +			return -EOPNOTSUPP; + +		/* +		 * Make sure the dir is empty, as otherwise we'd need to +		 * rehash everything and update the dirent keys. +		 */ +		ret = bch2_empty_dir_trans(trans, inode_inum(inode)); +		if (ret < 0) +			return ret; + +		ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); +		if (ret) +			return ret; + +		bch2_check_set_feature(c, BCH_FEATURE_casefolding); + +		bi->bi_casefold = s->casefold + 1; +		bi->bi_fields_set |= BIT(Inode_opt_casefold); + +#else +		printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); +		return -EOPNOTSUPP; +#endif +	} + +	if (s->set_project) { +		bi->bi_project = s->projid; +		bi->bi_fields_set |= BIT(Inode_opt_project); +	} + +	bi->bi_flags &= ~s->mask; +	bi->bi_flags |= s->flags; + +	bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); +	return 0; +} + +static int bch2_fileattr_set(struct mnt_idmap *idmap, +			     struct dentry *dentry, +			     struct fileattr *fa) +{ +	struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); +	struct bch_fs *c = inode->v.i_sb->s_fs_info; +	struct flags_set s = {}; +	int ret; + +	if (fa->fsx_valid) { +		fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + +		s.mask = map_defined(bch_flags_to_xflags); +		s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); +		if (fa->fsx_xflags) +			return -EOPNOTSUPP; + +		if (fa->fsx_projid >= U32_MAX) +			return -EINVAL; + +		/* +		 * inode fields accessible via the xattr interface are stored with a +1 +		 * bias, so that 0 means unset: +		 */ +		if ((inode->ei_inode.bi_project || +		     fa->fsx_projid) && +		    inode->ei_inode.bi_project != fa->fsx_projid + 1) { +			s.projid = fa->fsx_projid + 1; +			s.set_project = true; +		} +	} + 
+	if (fa->flags_valid) { +		s.mask = map_defined(bch_flags_to_uflags); + +		s.set_casefold = true; +		s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0; +		fa->flags &= ~FS_CASEFOLD_FL; + +		s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); +		if (fa->flags) +			return -EOPNOTSUPP; +	} + +	mutex_lock(&inode->ei_update_lock); +	ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: +		(s.set_project +		 ? bch2_set_projid(c, inode, fa->fsx_projid) +		 : 0) ?: +		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, +			       ATTR_CTIME); +	mutex_unlock(&inode->ei_update_lock); +	return ret; +} +  static const struct file_operations bch_file_operations = {  	.open		= bch2_open,  	.llseek		= bch2_llseek, @@ -1476,6 +1787,8 @@ static const struct inode_operations bch_file_inode_operations = {  	.get_inode_acl	= bch2_get_acl,  	.set_acl	= bch2_set_acl,  #endif +	.fileattr_get	= bch2_fileattr_get, +	.fileattr_set	= bch2_fileattr_set,  };  static const struct inode_operations bch_dir_inode_operations = { @@ -1496,6 +1809,8 @@ static const struct inode_operations bch_dir_inode_operations = {  	.get_inode_acl	= bch2_get_acl,  	.set_acl	= bch2_set_acl,  #endif +	.fileattr_get	= bch2_fileattr_get, +	.fileattr_set	= bch2_fileattr_set,  };  static const struct file_operations bch_dir_file_operations = { @@ -1518,6 +1833,8 @@ static const struct inode_operations bch_symlink_inode_operations = {  	.get_inode_acl	= bch2_get_acl,  	.set_acl	= bch2_set_acl,  #endif +	.fileattr_get	= bch2_fileattr_get, +	.fileattr_set	= bch2_fileattr_set,  };  static const struct inode_operations bch_special_inode_operations = { @@ -1528,6 +1845,8 @@ static const struct inode_operations bch_special_inode_operations = {  	.get_inode_acl	= bch2_get_acl,  	.set_acl	= bch2_set_acl,  #endif +	.fileattr_get	= bch2_fileattr_get, +	.fileattr_set	= bch2_fileattr_set,  };  static const struct address_space_operations bch_address_space_operations = { @@ -2185,10 +2504,9 @@ static int 
bch2_fs_get_tree(struct fs_context *fc)  	bch2_opts_apply(&c->opts, opts); -	/* -	 * need to initialise sb and set c->vfs_sb _before_ starting fs, -	 * for blk_holder_ops -	 */ +	ret = bch2_fs_start(c); +	if (ret) +		goto err_stop_fs;  	sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);  	ret = PTR_ERR_OR_ZERO(sb); @@ -2250,9 +2568,10 @@ got_sb:  	sb->s_shrink->seeks = 0; -	ret = bch2_fs_start(c); -	if (ret) -		goto err_put_super; +#ifdef CONFIG_UNICODE +	sb->s_encoding = c->cf_encoding; +#endif +	generic_set_sb_d_ops(sb);  	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);  	ret = PTR_ERR_OR_ZERO(vinode); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 18308f3d64a1..71d428f376a5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -321,6 +321,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)  	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)  		return false; +	/* +	 * Subvolume roots are special: older versions of subvolume roots may be +	 * disconnected, it's only the newest version that matters. +	 * +	 * We only keep a single dirent pointing to a subvolume root, i.e. +	 * older versions of snapshots will not have a different dirent pointing +	 * to the same subvolume root. +	 * +	 * This is because dirents that point to subvolumes are only visible in +	 * the parent subvolume - versioning is not needed - and keeping them +	 * around would break fsck, because when we're crossing subvolumes we +	 * don't have a consistent snapshot ID to do check the inode <-> dirent +	 * relationships. +	 * +	 * Thus, a subvolume root that's been renamed after a snapshot will have +	 * a disconnected older version - that's expected. +	 * +	 * Note that taking a snapshot always updates the root inode (to update +	 * the dirent backpointer), so a subvolume root inode with +	 * BCH_INODE_has_child_snapshot is never visible. 
+	 */ +	if (inode->bi_subvol && +	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) +		return false; +  	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);  } @@ -1007,6 +1032,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans,  	if (ret && !bch2_err_matches(ret, ENOENT))  		return ret; +	if ((ret || dirent_points_to_inode_nowarn(d, inode)) && +	    inode->bi_subvol && +	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) { +		/* Older version of a renamed subvolume root: we won't have a +		 * correct dirent for it. That's expected, see +		 * inode_should_reattach(). +		 * +		 * We don't clear the backpointer field when doing the rename +		 * because there might be arbitrarily many versions in older +		 * snapshots. +		 */ +		inode->bi_dir = 0; +		inode->bi_dir_offset = 0; +		*write_inode = true; +		goto out; +	} +  	if (fsck_err_on(ret,  			trans, inode_points_to_missing_dirent,  			"inode points to missing dirent\n%s", @@ -1027,7 +1069,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,  		inode->bi_dir_offset = 0;  		*write_inode = true;  	} - +out:  	ret = 0;  fsck_err:  	bch2_trans_iter_exit(trans, &dirent_iter); @@ -2404,7 +2446,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,  		u32 parent = le32_to_cpu(s.v->fs_path_parent);  		if (darray_u32_has(&subvol_path, parent)) { -			if (fsck_err(c, subvol_loop, "subvolume loop")) +			if (fsck_err(trans, subvol_loop, "subvolume loop"))  				ret = reattach_subvol(trans, s);  			break;  		} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index f82cfbf460d0..c74af15b14f2 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -243,6 +243,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k)  	}  } +static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) +{ +	/* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */ +	return bi->bi_casefold +		? 
bi->bi_casefold - 1 +		: c->opts.casefold; +} +  /* i_nlink: */  static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 117110af1e3f..87e193e8ed25 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -103,7 +103,8 @@ struct bch_inode_generation {  	x(bi_parent_subvol,		32)	\  	x(bi_nocow,			8)	\  	x(bi_depth,			32)	\ -	x(bi_inodes_32bit,		8) +	x(bi_inodes_32bit,		8)	\ +	x(bi_casefold,			8)  /* subset of BCH_INODE_FIELDS */  #define BCH_INODE_OPTS()			\ @@ -117,7 +118,8 @@ struct bch_inode_generation {  	x(background_target,		16)	\  	x(erasure_code,			16)	\  	x(nocow,			8)	\ -	x(inodes_32bit,			8) +	x(inodes_32bit,			8)	\ +	x(casefold,			8)  enum inode_opt_id {  #define x(name, ...)				\ @@ -137,8 +139,7 @@ enum inode_opt_id {  	x(i_sectors_dirty,		6)	\  	x(unlinked,			7)	\  	x(backptr_untrusted,		8)	\ -	x(has_child_snapshot,		9)	\ -	x(casefolded,			10) +	x(has_child_snapshot,		9)  /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 417bb0c7bbfa..def4a26a3b45 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -487,6 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work)  		.inum	= rbio->read_pos.inode,  	};  	struct bch_io_failures failed = { .nr = 0 }; +	int orig_error = rbio->ret; +  	struct btree_trans *trans = bch2_trans_get(c);  	trace_io_read_retry(&rbio->bio); @@ -519,7 +521,9 @@ static void bch2_rbio_retry(struct work_struct *work)  	if (ret) {  		rbio->ret = ret;  		rbio->bio.bi_status = BLK_STS_IOERR; -	} else { +	} else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace && +		   orig_error != -BCH_ERR_data_read_ptr_stale_race && +		   !failed.nr) {  		struct printbuf buf = PRINTBUF;  		lockrestart_do(trans, @@ -977,7 +981,8 @@ retry_pick:  		goto err;  	} -	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { +	if 
(unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && +	    !c->chacha20_key_set) {  		struct printbuf buf = PRINTBUF;  		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);  		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  "); @@ -1344,14 +1349,16 @@ err:  	bch2_trans_iter_exit(trans, &iter); -	if (ret) { -		struct printbuf buf = PRINTBUF; -		lockrestart_do(trans, -			bch2_inum_offset_err_msg_trans(trans, &buf, inum, -						       bvec_iter.bi_sector << 9)); -		prt_printf(&buf, "read error: %s", bch2_err_str(ret)); -		bch_err_ratelimited(c, "%s", buf.buf); -		printbuf_exit(&buf); +	if (unlikely(ret)) { +		if (ret != -BCH_ERR_extent_poisoned) { +			struct printbuf buf = PRINTBUF; +			lockrestart_do(trans, +				       bch2_inum_offset_err_msg_trans(trans, &buf, inum, +								      bvec_iter.bi_sector << 9)); +			prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); +			bch_err_ratelimited(c, "%s", buf.buf); +			printbuf_exit(&buf); +		}  		rbio->bio.bi_status	= BLK_STS_IOERR;  		rbio->ret		= ret; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index a418fa62f09d..c1237da079ed 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -255,6 +255,27 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,  	}  	if (i_sectors_delta) { +		s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); +		if (unlikely(bi_sectors + i_sectors_delta < 0)) { +			struct bch_fs *c = trans->c; +			struct printbuf buf = PRINTBUF; +			bch2_log_msg_start(c, &buf); +			prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", +				   extent_iter->pos.inode, bi_sectors, i_sectors_delta); + +			bool repeat = false, print = false, suppress = false; +			bch2_count_fsck_err(c, inode_i_sectors_underflow, buf.buf, +					    &repeat, &print, &suppress); +			if (print) +				bch2_print_str(c, buf.buf); +			printbuf_exit(&buf); + +			if (i_sectors_delta < 0) +				i_sectors_delta = -bi_sectors; +			
else +				i_sectors_delta = 0; +		} +  		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);  		inode_update_flags = 0;  	} diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d8f74b6d0a75..bb45d3634194 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -281,7 +281,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t  	sectors = vstruct_blocks_plus(buf->data, c->block_bits,  				      buf->u64s_reserved) << c->block_bits; -	BUG_ON(sectors > buf->sectors); +	if (unlikely(sectors > buf->sectors)) { +		struct printbuf err = PRINTBUF; +		err.atomic++; + +		prt_printf(&err, "journal entry overran reserved space: %u > %u\n", +			   sectors, buf->sectors); +		prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", +			   le32_to_cpu(buf->data->u64s), buf->u64s_reserved, +			   j->cur_entry_u64s, +			   c->block_bits); +		prt_printf(&err, "fatal error - emergency read only"); +		bch2_journal_halt_locked(j); + +		bch_err(c, "%s", err.buf); +		printbuf_exit(&err); +		return; +	} +  	buf->sectors = sectors;  	/* @@ -1462,8 +1479,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)  		j->last_empty_seq = cur_seq - 1; /* to match j->seq */  	spin_lock(&j->lock); - -	set_bit(JOURNAL_running, &j->flags);  	j->last_flush_write = jiffies;  	j->reservations.idx = journal_cur_seq(j); @@ -1474,6 +1489,21 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)  	return 0;  } +void bch2_journal_set_replay_done(struct journal *j) +{ +	/* +	 * journal_space_available must happen before setting JOURNAL_running +	 * JOURNAL_running must happen before JOURNAL_replay_done +	 */ +	spin_lock(&j->lock); +	bch2_journal_space_available(j); + +	set_bit(JOURNAL_need_flush_write, &j->flags); +	set_bit(JOURNAL_running, &j->flags); +	set_bit(JOURNAL_replay_done, &j->flags); +	spin_unlock(&j->lock); +} +  /* init/exit: */  void bch2_dev_journal_exit(struct bch_dev *ca) diff --git 
a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 47828771f9c2..641e20c05a14 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -437,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j)  struct bch_dev; -static inline void bch2_journal_set_replay_done(struct journal *j) -{ -	BUG_ON(!test_bit(JOURNAL_running, &j->flags)); -	set_bit(JOURNAL_replay_done, &j->flags); -} -  void bch2_journal_unblock(struct journal *);  void bch2_journal_block(struct journal *);  struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); @@ -459,6 +453,7 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *);  void bch2_fs_journal_stop(struct journal *);  int bch2_fs_journal_start(struct journal *, u64); +void bch2_journal_set_replay_done(struct journal *);  void bch2_dev_journal_exit(struct bch_dev *);  int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1b7961f4f609..ded18a94ed02 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -19,6 +19,7 @@  #include <linux/ioprio.h>  #include <linux/string_choices.h> +#include <linux/sched/sysctl.h>  void bch2_journal_pos_from_member_info_set(struct bch_fs *c)  { @@ -1262,7 +1263,8 @@ int bch2_journal_read(struct bch_fs *c,  			degraded = true;  	} -	closure_sync(&jlist.cl); +	while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) +		;  	if (jlist.ret)  		return jlist.ret; @@ -1460,7 +1462,7 @@ fsck_err:  static void journal_advance_devs_to_next_bucket(struct journal *j,  						struct dev_alloc_list *devs, -						unsigned sectors, u64 seq) +						unsigned sectors, __le64 seq)  {  	struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -1782,7 +1784,7 @@ static CLOSURE_CALLBACK(journal_write_submit)  		struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);  		if (!ca) {  			/* XXX: fix this */ -			bch_err(c, "missing device for 
journal write\n"); +			bch_err(c, "missing device %u for journal write", ptr->dev);  			continue;  		} diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 5d1547aa118a..cc00b0fc40d8 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -17,6 +17,8 @@  #include <linux/kthread.h>  #include <linux/sched/mm.h> +static bool __should_discard_bucket(struct journal *, struct journal_device *); +  /* Free space calculations: */  static unsigned journal_space_from(struct journal_device *ja, @@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j)  		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)  			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -		if (ja->discard_idx != ja->dirty_idx_ondisk) -			can_discard = true; +		can_discard |= __should_discard_bucket(j, ja);  		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);  		nr_online++; @@ -252,7 +253,10 @@ void bch2_journal_space_available(struct journal *j)  	bch2_journal_set_watermark(j);  out: -	j->cur_entry_sectors	= !ret ? j->space[journal_space_discarded].next_entry : 0; +	j->cur_entry_sectors	= !ret +		? 
round_down(j->space[journal_space_discarded].next_entry, +			     block_sectors(c)) +		: 0;  	j->cur_entry_error	= ret;  	if (!ret) @@ -261,12 +265,19 @@ out:  /* Discards - last part of journal reclaim: */ -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)  { -	bool ret; +	unsigned min_free = max(4, ja->nr / 8); + +	return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < +		min_free && +		ja->discard_idx != ja->dirty_idx_ondisk; +} +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{  	spin_lock(&j->lock); -	ret = ja->discard_idx != ja->dirty_idx_ondisk; +	bool ret = __should_discard_bucket(j, ja);  	spin_unlock(&j->lock);  	return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fc396b9fa754..dfdbb9259985 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -784,7 +784,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,  		goto err;  	ret = bch2_btree_write_buffer_tryflush(trans); -	bch_err_msg(c, ret, "flushing btree write buffer"); +	if (!bch2_err_matches(ret, EROFS)) +		bch_err_msg(c, ret, "flushing btree write buffer");  	if (ret)  		goto err; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 159410c50861..96873372b516 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -356,6 +356,13 @@ static int bch2_copygc_thread(void *arg)  	set_freezable(); +	/* +	 * Data move operations can't run until after check_snapshots has +	 * completed, and bch2_snapshot_is_ancestor() is available. 
+	 */ +	kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || +			       kthread_should_stop()); +  	bch2_move_stats_init(&move_stats, "copygc");  	bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,  			      writepoint_ptr(&c->copygc_write_point), diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index ea181fef5bc9..d1885cf67a45 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -5,6 +5,15 @@  unsigned long bch2_copygc_wait_amount(struct bch_fs *);  void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); +static inline void bch2_copygc_wakeup(struct bch_fs *c) +{ +	rcu_read_lock(); +	struct task_struct *p = rcu_dereference(c->copygc_thread); +	if (p) +		wake_up_process(p); +	rcu_read_unlock(); +} +  void bch2_copygc_stop(struct bch_fs *);  int bch2_copygc_start(struct bch_fs *);  void bch2_fs_copygc_init(struct bch_fs *); diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 0d65ea96f7a2..52c58c6d53d2 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -47,10 +47,6 @@ int bch2_create_trans(struct btree_trans *trans,  	if (ret)  		goto err; -	/* Inherit casefold state from parent. 
*/ -	if (S_ISDIR(mode)) -		new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded; -  	if (!(flags & BCH_CREATE_SNAPSHOT)) {  		/* Normal create path - allocate a new inode: */  		bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); @@ -347,6 +343,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,  	bool ret = false;  	for (id = 0; id < Inode_opt_nr; id++) { +		if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold) +			continue; +  		/* Skip attributes that were explicitly set on this inode */  		if (dst_u->bi_fields_set & (1 << id))  			continue; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 4d06313076ff..dfb14810124c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -228,6 +228,11 @@ enum fsck_err_opts {  	  OPT_BOOL(),							\  	  BCH_SB_ERASURE_CODE,		false,				\  	  NULL,		"Enable erasure coding (DO NOT USE YET)")	\ +	x(casefold,			u8,				\ +	  OPT_FS|OPT_INODE|OPT_FORMAT,					\ +	  OPT_BOOL(),							\ +	  BCH_SB_CASEFOLD,		false,				\ +	  NULL,		"Dirent lookups are casefolded")		\  	x(inodes_32bit,			u8,				\  	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\  	  OPT_BOOL(),							\ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c63fa53f30d2..623273556aa9 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -262,7 +262,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)  	int ret = bch2_trans_commit_do(c, NULL, NULL,  				       BCH_TRANS_COMMIT_no_enospc,  			    bch2_set_rebalance_needs_scan_trans(trans, inum)); -	rebalance_wakeup(c); +	bch2_rebalance_wakeup(c);  	return ret;  } @@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,  					   struct btree_iter *iter,  					   struct bkey_s_c k)  { -	if (!bch2_bkey_rebalance_opts(k)) +	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))  		return 0;  	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); @@ -581,6 +581,13 @@ static 
int bch2_rebalance_thread(void *arg)  	set_freezable(); +	/* +	 * Data move operations can't run until after check_snapshots has +	 * completed, and bch2_snapshot_is_ancestor() is available. +	 */ +	kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || +			       kthread_should_stop()); +  	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,  			      writepoint_ptr(&c->rebalance_write_point),  			      true); @@ -664,7 +671,7 @@ void bch2_rebalance_stop(struct bch_fs *c)  	c->rebalance.thread = NULL;  	if (p) { -		/* for sychronizing with rebalance_wakeup() */ +		/* for sychronizing with bch2_rebalance_wakeup() */  		synchronize_rcu();  		kthread_stop(p); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 62a3859d3823..e5e8eb4a2dd1 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -37,7 +37,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);  int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);  int bch2_set_fs_needs_rebalance(struct bch_fs *); -static inline void rebalance_wakeup(struct bch_fs *c) +static inline void bch2_rebalance_wakeup(struct bch_fs *c)  {  	struct task_struct *p; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 79fd18a5a07c..d6c4ef819d40 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -18,6 +18,7 @@  #include "journal_seq_blacklist.h"  #include "logged_ops.h"  #include "move.h" +#include "movinggc.h"  #include "namei.h"  #include "quota.h"  #include "rebalance.h" @@ -389,9 +390,9 @@ int bch2_journal_replay(struct bch_fs *c)  	 * Now, replay any remaining keys in the order in which they appear in  	 * the journal, unpinning those journal entries as we go:  	 */ -	sort(keys_sorted.data, keys_sorted.nr, -	     sizeof(keys_sorted.data[0]), -	     journal_sort_seq_cmp, NULL); +	sort_nonatomic(keys_sorted.data, keys_sorted.nr, +		       sizeof(keys_sorted.data[0]), +		       journal_sort_seq_cmp, NULL);  	
darray_for_each(keys_sorted, kp) {  		cond_resched(); @@ -1125,14 +1126,17 @@ int bch2_fs_initialize(struct bch_fs *c)  	 * journal_res_get() will crash if called before this has  	 * set up the journal.pin FIFO and journal.cur pointer:  	 */ -	bch2_fs_journal_start(&c->journal, 1); -	set_bit(BCH_FS_accounting_replay_done, &c->flags); -	bch2_journal_set_replay_done(&c->journal); +	ret = bch2_fs_journal_start(&c->journal, 1); +	if (ret) +		goto err;  	ret = bch2_fs_read_write_early(c);  	if (ret)  		goto err; +	set_bit(BCH_FS_accounting_replay_done, &c->flags); +	bch2_journal_set_replay_done(&c->journal); +  	for_each_member_device(c, ca) {  		ret = bch2_dev_usage_init(ca, false);  		if (ret) { @@ -1191,6 +1195,9 @@ int bch2_fs_initialize(struct bch_fs *c)  	c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; +	bch2_copygc_wakeup(c); +	bch2_rebalance_wakeup(c); +  	if (enabled_qtypes(c)) {  		ret = bch2_fs_quota_read(c);  		if (ret) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 593ff142530d..22f72bb5b853 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -12,6 +12,7 @@  #include "journal.h"  #include "lru.h"  #include "logged_ops.h" +#include "movinggc.h"  #include "rebalance.h"  #include "recovery.h"  #include "recovery_passes.h" @@ -262,49 +263,52 @@ int bch2_run_recovery_passes(struct bch_fs *c)  	 */  	c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; -	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { -		c->next_recovery_pass = c->curr_recovery_pass + 1; +	spin_lock_irq(&c->recovery_pass_lock); -		spin_lock_irq(&c->recovery_pass_lock); +	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { +		unsigned prev_done = c->recovery_pass_done;  		unsigned pass = c->curr_recovery_pass; +		c->next_recovery_pass = pass + 1; +  		if (c->opts.recovery_pass_last && -		    c->curr_recovery_pass > c->opts.recovery_pass_last) { -			
spin_unlock_irq(&c->recovery_pass_lock); +		    c->curr_recovery_pass > c->opts.recovery_pass_last)  			break; -		} -		if (!should_run_recovery_pass(c, pass)) { -			c->curr_recovery_pass++; -			c->recovery_pass_done = max(c->recovery_pass_done, pass); +		if (should_run_recovery_pass(c, pass)) {  			spin_unlock_irq(&c->recovery_pass_lock); -			continue; -		} -		spin_unlock_irq(&c->recovery_pass_lock); - -		ret =   bch2_run_recovery_pass(c, pass) ?: -			bch2_journal_flush(&c->journal); - -		if (!ret && !test_bit(BCH_FS_error, &c->flags)) -			bch2_clear_recovery_pass_required(c, pass); - -		spin_lock_irq(&c->recovery_pass_lock); -		if (c->next_recovery_pass < c->curr_recovery_pass) { -			/* -			 * bch2_run_explicit_recovery_pass() was called: we -			 * can't always catch -BCH_ERR_restart_recovery because -			 * it may have been called from another thread (btree -			 * node read completion) -			 */ -			ret = 0; -			c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); -		} else { -			c->recovery_passes_complete |= BIT_ULL(pass); -			c->recovery_pass_done = max(c->recovery_pass_done, pass); +			ret =   bch2_run_recovery_pass(c, pass) ?: +				bch2_journal_flush(&c->journal); + +			if (!ret && !test_bit(BCH_FS_error, &c->flags)) +				bch2_clear_recovery_pass_required(c, pass); +			spin_lock_irq(&c->recovery_pass_lock); + +			if (c->next_recovery_pass < c->curr_recovery_pass) { +				/* +				 * bch2_run_explicit_recovery_pass() was called: we +				 * can't always catch -BCH_ERR_restart_recovery because +				 * it may have been called from another thread (btree +				 * node read completion) +				 */ +				ret = 0; +				c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); +			} else { +				c->recovery_passes_complete |= BIT_ULL(pass); +				c->recovery_pass_done = max(c->recovery_pass_done, pass); +			}  		} +  		c->curr_recovery_pass = c->next_recovery_pass; -		spin_unlock_irq(&c->recovery_pass_lock); + +		if (prev_done <= 
BCH_RECOVERY_PASS_check_snapshots && +		    c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) { +			bch2_copygc_wakeup(c); +			bch2_rebalance_wakeup(c); +		}  	} +	spin_unlock_irq(&c->recovery_pass_lock); +  	return ret;  } diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index acb5d845841e..badd0e17ada5 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -20,6 +20,10 @@   * x(version, recovery_passes, errors...)   */  #define UPGRADE_TABLE()						\ +	x(snapshot_2,						\ +	  RECOVERY_PASS_ALL_FSCK,				\ +	  BCH_FSCK_ERR_subvol_root_wrong_bi_subvol,		\ +	  BCH_FSCK_ERR_subvol_not_master_and_not_snapshot)	\  	x(backpointers,						\  	  RECOVERY_PASS_ALL_FSCK)				\  	x(inode_v3,						\ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 5d43e3504386..3b69a924086f 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -46,7 +46,7 @@ enum bch_fsck_flags {  	x(btree_node_unsupported_version,			 34,	0)		\  	x(btree_node_bset_older_than_sb_min,			 35,	0)		\  	x(btree_node_bset_newer_than_sb,			 36,	0)		\ -	x(btree_node_data_missing,				 37,	0)		\ +	x(btree_node_data_missing,				 37,	FSCK_AUTOFIX)	\  	x(btree_node_bset_after_end,				 38,	0)		\  	x(btree_node_replicas_sectors_written_mismatch,		 39,	0)		\  	x(btree_node_replicas_data_mismatch,			 40,	0)		\ @@ -205,9 +205,9 @@ enum bch_fsck_flags {  	x(snapshot_bad_depth,					184,	0)		\  	x(snapshot_bad_skiplist,				185,	0)		\  	x(subvol_pos_bad,					186,	0)		\ -	x(subvol_not_master_and_not_snapshot,			187,	0)		\ +	x(subvol_not_master_and_not_snapshot,			187,	FSCK_AUTOFIX)	\  	x(subvol_to_missing_root,				188,	0)		\ -	x(subvol_root_wrong_bi_subvol,				189,	0)		\ +	x(subvol_root_wrong_bi_subvol,				189,	FSCK_AUTOFIX)	\  	x(bkey_in_missing_snapshot,				190,	0)		\  	x(inode_pos_inode_nonzero,				191,	0)		\  	x(inode_pos_blockdev_range,				192,	0)		\ @@ -236,6 +236,9 @@ enum bch_fsck_flags {  	
x(inode_has_child_snapshots_wrong,			287,	0)		\  	x(inode_unreachable,					210,	FSCK_AUTOFIX)	\  	x(inode_journal_seq_in_future,				299,	FSCK_AUTOFIX)	\ +	x(inode_i_sectors_underflow,				312,	FSCK_AUTOFIX)	\ +	x(vfs_inode_i_blocks_underflow,				311,	FSCK_AUTOFIX)	\ +	x(vfs_inode_i_blocks_not_zero_at_truncate,		313,	FSCK_AUTOFIX)	\  	x(deleted_inode_but_clean,				211,	FSCK_AUTOFIX)	\  	x(deleted_inode_missing,				212,	FSCK_AUTOFIX)	\  	x(deleted_inode_is_dir,					213,	FSCK_AUTOFIX)	\ @@ -290,8 +293,8 @@ enum bch_fsck_flags {  	x(btree_node_bkey_bad_u64s,				260,	0)		\  	x(btree_node_topology_empty_interior_node,		261,	0)		\  	x(btree_ptr_v2_min_key_bad,				262,	0)		\ -	x(btree_root_unreadable_and_scan_found_nothing,		263,	0)		\ -	x(snapshot_node_missing,				264,	0)		\ +	x(btree_root_unreadable_and_scan_found_nothing,		263,	FSCK_AUTOFIX)	\ +	x(snapshot_node_missing,				264,	FSCK_AUTOFIX)	\  	x(dup_backpointer_to_bad_csum_extent,			265,	0)		\  	x(btree_bitmap_not_marked,				266,	FSCK_AUTOFIX)	\  	x(sb_clean_entry_overrun,				267,	0)		\ @@ -317,7 +320,9 @@ enum bch_fsck_flags {  	x(directory_size_mismatch,				303,	FSCK_AUTOFIX)	\  	x(dirent_cf_name_too_big,				304,	0)		\  	x(dirent_stray_data_after_cf_name,			305,	0)		\ -	x(MAX,							308,	0) +	x(rebalance_work_incorrectly_set,			309,	FSCK_AUTOFIX)	\ +	x(rebalance_work_incorrectly_unset,			310,	FSCK_AUTOFIX)	\ +	x(MAX,							314,	0)  enum bch_sb_error_id {  #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 116131f95815..72779912939b 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -15,9 +15,11 @@ void bch2_dev_missing(struct bch_fs *c, unsigned dev)  		bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);  } -void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) +void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)  { -	bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset); +	bch2_fs_inconsistent(ca->fs, +		"pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)", +		bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);  }  #define x(t, n, ...) [n] = #t, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 06bb41a3f360..42786657522c 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -249,20 +249,23 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)  static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)  {  	struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); -	if (ca && !bucket_valid(ca, bucket.offset)) { +	if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {  		bch2_dev_put(ca);  		ca = NULL;  	}  	return ca;  } -void bch2_dev_bucket_missing(struct bch_fs *, struct bpos); +void bch2_dev_bucket_missing(struct bch_dev *, u64);  static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)  { -	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket); -	if (!ca) -		bch2_dev_bucket_missing(c, bucket); +	struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); +	if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { +		bch2_dev_bucket_missing(ca, bucket.offset); +		bch2_dev_put(ca); +		ca = NULL; +	}  	return ca;  } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 
b7de29aed839..fec569c7deb1 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -396,7 +396,7 @@ u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)  	u32 subvol = 0, s;  	rcu_read_lock(); -	while (id) { +	while (id && bch2_snapshot_exists(c, id)) {  		s = snapshot_t(c, id)->subvol;  		if (s && (!subvol || s < subvol)) diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 09a354a26c3b..0c1a00539bd1 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -33,7 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)  struct bch_hash_info {  	u8			type; -	struct unicode_map 	*cf_encoding; +	struct unicode_map	*cf_encoding;  	/*  	 * For crc32 or crc64 string hashes the first key value of  	 * the siphash_key (k0) is used as the key. @@ -44,11 +44,10 @@ struct bch_hash_info {  static inline struct bch_hash_info  bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)  { -	/* XXX ick */  	struct bch_hash_info info = {  		.type = INODE_STR_HASH(bi),  #ifdef CONFIG_UNICODE -		.cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL, +		.cf_encoding = bch2_inode_casefold(c, bi) ? 
c->cf_encoding : NULL,  #endif  		.siphash_key = { .k0 = bi->bi_hash_seed }  	}; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 5537283d0bea..d0209f7658bb 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -6,6 +6,7 @@  #include "errcode.h"  #include "error.h"  #include "fs.h" +#include "recovery_passes.h"  #include "snapshot.h"  #include "subvolume.h" @@ -44,8 +45,8 @@ static int check_subvol(struct btree_trans *trans,  	ret = bch2_snapshot_lookup(trans, snapid, &snapshot);  	if (bch2_err_matches(ret, ENOENT)) -		bch_err(c, "subvolume %llu points to nonexistent snapshot %u", -			k.k->p.offset, snapid); +		return bch2_run_explicit_recovery_pass(c, +					BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;  	if (ret)  		return ret; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e27422b6d9c6..cb5d960aed92 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -73,14 +73,30 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v  		? 
0  		: -BCH_ERR_may_not_use_incompat_feature; +	mutex_lock(&c->sb_lock);  	if (!ret) { -		mutex_lock(&c->sb_lock);  		SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,  			max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));  		bch2_write_super(c); -		mutex_unlock(&c->sb_lock); +	} else { +		darray_for_each(c->incompat_versions_requested, i) +			if (version == *i) +				goto out; + +		darray_push(&c->incompat_versions_requested, version); +		struct printbuf buf = PRINTBUF; +		prt_str(&buf, "requested incompat feature "); +		bch2_version_to_text(&buf, version); +		prt_str(&buf, " currently not enabled"); +		prt_printf(&buf, "\n  set version_upgrade=incompat to enable"); + +		bch_notice(c, "%s", buf.buf); +		printbuf_exit(&buf);  	} +out: +	mutex_unlock(&c->sb_lock); +  	return ret;  } @@ -1086,7 +1102,8 @@ int bch2_write_super(struct bch_fs *c)  		prt_str(&buf, ")");  		bch2_fs_fatal_error(c, ": %s", buf.buf);  		printbuf_exit(&buf); -		return -BCH_ERR_sb_not_downgraded; +		ret = -BCH_ERR_sb_not_downgraded; +		goto out;  	}  	darray_for_each(online_devices, ca) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a58edde43bee..84a37d971ffd 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -70,14 +70,10 @@  #include <linux/percpu.h>  #include <linux/random.h>  #include <linux/sysfs.h> -#include <crypto/hash.h>  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");  MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: chacha20"); -MODULE_SOFTDEP("pre: poly1305"); -MODULE_SOFTDEP("pre: xxhash");  const char * const bch2_fs_flag_strs[] = {  #define x(n)		#n, @@ -381,6 +377,11 @@ void bch2_fs_read_only(struct bch_fs *c)  		bch_verbose(c, "marking filesystem clean");  		bch2_fs_mark_clean(c);  	} else { +		/* Make sure error counts/counters are persisted */ +		mutex_lock(&c->sb_lock); +		bch2_write_super(c); +		mutex_unlock(&c->sb_lock); +  		bch_verbose(c, "done going read-only, filesystem not clean");  	}  } @@ 
-422,32 +423,6 @@ bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)  	return ret;  } -static int bch2_fs_read_write_late(struct bch_fs *c) -{ -	int ret; - -	/* -	 * Data move operations can't run until after check_snapshots has -	 * completed, and bch2_snapshot_is_ancestor() is available. -	 * -	 * Ideally we'd start copygc/rebalance earlier instead of waiting for -	 * all of recovery/fsck to complete: -	 */ -	ret = bch2_copygc_start(c); -	if (ret) { -		bch_err(c, "error starting copygc thread"); -		return ret; -	} - -	ret = bch2_rebalance_start(c); -	if (ret) { -		bch_err(c, "error starting rebalance thread"); -		return ret; -	} - -	return 0; -} -  static int __bch2_fs_read_write(struct bch_fs *c, bool early)  {  	int ret; @@ -470,29 +445,28 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)  	clear_bit(BCH_FS_clean_shutdown, &c->flags); +	__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { +		bch2_dev_allocator_add(c, ca); +		percpu_ref_reinit(&ca->io_ref[WRITE]); +	} +	bch2_recalc_capacity(c); +  	/*  	 * First journal write must be a flush write: after a clean shutdown we  	 * don't read the journal, so the first journal write may end up  	 * overwriting whatever was there previously, and there must always be  	 * at least one non-flush write in the journal or recovery will fail:  	 */ +	spin_lock(&c->journal.lock);  	set_bit(JOURNAL_need_flush_write, &c->journal.flags);  	set_bit(JOURNAL_running, &c->journal.flags); - -	__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { -		bch2_dev_allocator_add(c, ca); -		percpu_ref_reinit(&ca->io_ref[WRITE]); -	} -	bch2_recalc_capacity(c); +	bch2_journal_space_available(&c->journal); +	spin_unlock(&c->journal.lock);  	ret = bch2_fs_mark_dirty(c);  	if (ret)  		goto err; -	spin_lock(&c->journal.lock); -	bch2_journal_space_available(&c->journal); -	spin_unlock(&c->journal.lock); -  	ret = bch2_journal_reclaim_start(&c->journal);  	if (ret)  		goto err; @@ -508,10 +482,17 @@ 
static int __bch2_fs_read_write(struct bch_fs *c, bool early)  		atomic_long_inc(&c->writes[i]);  	}  #endif -	if (!early) { -		ret = bch2_fs_read_write_late(c); -		if (ret) -			goto err; + +	ret = bch2_copygc_start(c); +	if (ret) { +		bch_err_msg(c, ret, "error starting copygc thread"); +		goto err; +	} + +	ret = bch2_rebalance_start(c); +	if (ret) { +		bch_err_msg(c, ret, "error starting rebalance thread"); +		goto err;  	}  	bch2_do_discards(c); @@ -555,8 +536,13 @@ static void __bch2_fs_free(struct bch_fs *c)  	for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)  		bch2_time_stats_exit(&c->times[i]); +#ifdef CONFIG_UNICODE +	utf8_unload(c->cf_encoding); +#endif +  	bch2_find_btree_nodes_exit(&c->found_btree_nodes);  	bch2_free_pending_node_rewrites(c); +	bch2_free_fsck_errs(c);  	bch2_fs_accounting_exit(c);  	bch2_fs_sb_errors_exit(c);  	bch2_fs_counters_exit(c); @@ -593,6 +579,7 @@ static void __bch2_fs_free(struct bch_fs *c)  		free_percpu(c->online_reserved);  	} +	darray_exit(&c->incompat_versions_requested);  	darray_exit(&c->btree_roots_extra);  	free_percpu(c->pcpu);  	free_percpu(c->usage); @@ -845,25 +832,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)  	if (ret)  		goto err; -#ifdef CONFIG_UNICODE -	/* Default encoding until we can potentially have more as an option. */ -	c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); -	if (IS_ERR(c->cf_encoding)) { -		printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", -			unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), -			unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), -			unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); -		ret = -EINVAL; -		goto err; -	} -#else -	if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { -		printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); -		ret = -EINVAL; -		goto err; -	} -#endif -  	pr_uuid(&name, c->sb.user_uuid.b);  	ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0;  	if (ret) @@ -963,6 +931,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)  	if (ret)  		goto err; +#ifdef CONFIG_UNICODE +	/* Default encoding until we can potentially have more as an option. */ +	c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); +	if (IS_ERR(c->cf_encoding)) { +		printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", +			unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), +			unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), +			unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); +		ret = -EINVAL; +		goto err; +	} +	bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", +		 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), +		 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), +		 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); +#else +	if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { +		printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); +		ret = -EINVAL; +		goto err; +	} +#endif +  	for (i = 0; i < c->sb.nr_devices; i++) {  		if (!bch2_member_exists(c->disk_sb.sb, i))  			continue; @@ -1002,12 +993,6 @@ static void print_mount_opts(struct bch_fs *c)  	prt_str(&p, "starting version ");  	bch2_version_to_text(&p, c->sb.version); -	if (c->opts.read_only) { -		prt_str(&p, " opts="); -		first = false; -		prt_printf(&p, "ro"); -	} -  	for (i = 0; i < bch2_opts_nr; i++) {  		const struct bch_option *opt = &bch2_opt_table[i];  		u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -1023,10 +1008,49 @@ static void print_mount_opts(struct bch_fs *c)  		bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);  	} +	if (c->sb.version_incompat_allowed != c->sb.version) { +		prt_printf(&p, "\n  allowing incompatible features above "); +		bch2_version_to_text(&p, c->sb.version_incompat_allowed); +	} +  	bch_info(c, "%s", p.buf);  	printbuf_exit(&p);  } +static bool bch2_fs_may_start(struct bch_fs *c) +{ +	struct bch_dev 
*ca; +	unsigned i, flags = 0; + +	if (c->opts.very_degraded) +		flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; + +	if (c->opts.degraded) +		flags |= BCH_FORCE_IF_DEGRADED; + +	if (!c->opts.degraded && +	    !c->opts.very_degraded) { +		mutex_lock(&c->sb_lock); + +		for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { +			if (!bch2_member_exists(c->disk_sb.sb, i)) +				continue; + +			ca = bch2_dev_locked(c, i); + +			if (!bch2_dev_is_online(ca) && +			    (ca->mi.state == BCH_MEMBER_STATE_rw || +			     ca->mi.state == BCH_MEMBER_STATE_ro)) { +				mutex_unlock(&c->sb_lock); +				return false; +			} +		} +		mutex_unlock(&c->sb_lock); +	} + +	return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); +} +  int bch2_fs_start(struct bch_fs *c)  {  	time64_t now = ktime_get_real_seconds(); @@ -1034,6 +1058,9 @@ int bch2_fs_start(struct bch_fs *c)  	print_mount_opts(c); +	if (!bch2_fs_may_start(c)) +		return -BCH_ERR_insufficient_devices_to_start; +  	down_write(&c->state_lock);  	mutex_lock(&c->sb_lock); @@ -1086,13 +1113,10 @@ int bch2_fs_start(struct bch_fs *c)  	wake_up(&c->ro_ref_wait);  	down_write(&c->state_lock); -	if (c->opts.read_only) { +	if (c->opts.read_only)  		bch2_fs_read_only(c); -	} else { -		ret = !test_bit(BCH_FS_rw, &c->flags) -			? 
bch2_fs_read_write(c) -			: bch2_fs_read_write_late(c); -	} +	else if (!test_bit(BCH_FS_rw, &c->flags)) +		ret = bch2_fs_read_write(c);  	up_write(&c->state_lock);  err: @@ -1504,7 +1528,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)  	printbuf_exit(&name); -	rebalance_wakeup(c); +	bch2_rebalance_wakeup(c);  	return 0;  } @@ -1563,40 +1587,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,  	}  } -static bool bch2_fs_may_start(struct bch_fs *c) -{ -	struct bch_dev *ca; -	unsigned i, flags = 0; - -	if (c->opts.very_degraded) -		flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - -	if (c->opts.degraded) -		flags |= BCH_FORCE_IF_DEGRADED; - -	if (!c->opts.degraded && -	    !c->opts.very_degraded) { -		mutex_lock(&c->sb_lock); - -		for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -			if (!bch2_member_exists(c->disk_sb.sb, i)) -				continue; - -			ca = bch2_dev_locked(c, i); - -			if (!bch2_dev_is_online(ca) && -			    (ca->mi.state == BCH_MEMBER_STATE_rw || -			     ca->mi.state == BCH_MEMBER_STATE_ro)) { -				mutex_unlock(&c->sb_lock); -				return false; -			} -		} -		mutex_unlock(&c->sb_lock); -	} - -	return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); -} -  static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)  {  	bch2_dev_io_ref_stop(ca, WRITE); @@ -1650,7 +1640,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,  	if (new_state == BCH_MEMBER_STATE_rw)  		__bch2_dev_read_write(c, ca); -	rebalance_wakeup(c); +	bch2_rebalance_wakeup(c);  	return ret;  } @@ -1767,7 +1757,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)  	up_write(&c->state_lock);  	return 0;  err: -	if (ca->mi.state == BCH_MEMBER_STATE_rw && +	if (test_bit(BCH_FS_rw, &c->flags) && +	    ca->mi.state == BCH_MEMBER_STATE_rw &&  	    !percpu_ref_is_zero(&ca->io_ref[READ]))  		__bch2_dev_read_write(c, ca);  	up_write(&c->state_lock); @@ -2231,11 +2222,6 @@ struct bch_fs 
*bch2_fs_open(char * const *devices, unsigned nr_devices,  	}  	up_write(&c->state_lock); -	if (!bch2_fs_may_start(c)) { -		ret = -BCH_ERR_insufficient_devices_to_start; -		goto err_print; -	} -  	if (!c->opts.nostart) {  		ret = bch2_fs_start(c);  		if (ret) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index e5f003c29369..82ee333ddd21 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -654,11 +654,10 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,  		bch2_set_rebalance_needs_scan(c, 0);  	if (v && id == Opt_rebalance_enabled) -		rebalance_wakeup(c); +		bch2_rebalance_wakeup(c); -	if (v && id == Opt_copygc_enabled && -	    c->copygc_thread) -		wake_up_process(c->copygc_thread); +	if (v && id == Opt_copygc_enabled) +		bch2_copygc_wakeup(c);  	if (id == Opt_discard && !ca) {  		mutex_lock(&c->sb_lock); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index c265b102267a..782a05fe7656 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)   */  static int test_peek_end(struct bch_fs *c, u64 nr)  { +	delete_test_keys(c); +  	struct btree_trans *trans = bch2_trans_get(c);  	struct btree_iter iter;  	struct bkey_s_c k; @@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)  static int test_peek_end_extents(struct bch_fs *c, u64 nr)  { +	delete_test_keys(c); +  	struct btree_trans *trans = bch2_trans_get(c);  	struct btree_iter iter;  	struct bkey_s_c k; diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index dea73bc1cb51..314a24d15d4e 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki  	struct stdio_buf *buf = &stdio->output;  	unsigned long flags;  	ssize_t ret; -  again: +	if (stdio->done) +		return -EPIPE; +  	spin_lock_irqsave(&buf->lock, flags);  	ret = 
bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);  	spin_unlock_irqrestore(&buf->lock, flags); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 6ba5071ab6dd..3e52c7f8ddd2 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -739,4 +739,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len)  		*--dst = *src++;  } +#define set_flags(_map, _in, _out)					\ +do {									\ +	unsigned _i;							\ +									\ +	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\ +		if ((_in) & (1 << _i))					\ +			(_out) |= _map[_i];				\ +		else							\ +			(_out) &= ~_map[_i];				\ +} while (0) + +#define map_flags(_map, _in)						\ +({									\ +	unsigned _out = 0;						\ +									\ +	set_flags(_map, _in, _out);					\ +	_out;								\ +}) + +#define map_flags_rev(_map, _in)					\ +({									\ +	unsigned _i, _out = 0;						\ +									\ +	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\ +		if ((_in) & _map[_i]) {					\ +			(_out) |= 1 << _i;				\ +			(_in) &= ~_map[_i];				\ +		}							\ +	(_out);								\ +}) + +#define map_defined(_map)						\ +({									\ +	unsigned _in = ~0;						\ +									\ +	map_flags_rev(_map, _in);					\ +}) +  #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index c7916011ef34..67426e33d04e 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,13 @@ struct bch_xattr {  	__u8			x_type;  	__u8			x_name_len;  	__le16			x_val_len; -	__u8			x_name[] __counted_by(x_name_len); +	/* +	 * x_name contains the name and value counted by +	 * x_name_len + x_val_len. The introduction of +	 * __counted_by(x_name_len) caused a false positive +	 * detection of an out of bounds write. 
+	 */ +	__u8			x_name[];  } __packed __aligned(8);  #endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 584fa89bc877..4c1ea6b52a53 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -830,6 +830,7 @@ static int load_elf_binary(struct linux_binprm *bprm)  	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;  	struct elf_phdr *elf_property_phdata = NULL;  	unsigned long elf_brk; +	bool brk_moved = false;  	int retval, i;  	unsigned long elf_entry;  	unsigned long e_entry; @@ -1097,15 +1098,19 @@ out_free_interp:  			/* Calculate any requested alignment. */  			alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); -			/* -			 * There are effectively two types of ET_DYN -			 * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP) -			 * and loaders (ET_DYN without PT_INTERP, since they -			 * _are_ the ELF interpreter). The loaders must -			 * be loaded away from programs since the program -			 * may otherwise collide with the loader (especially -			 * for ET_EXEC which does not have a randomized -			 * position). For example to handle invocations of +			/** +			 * DOC: PIE handling +			 * +			 * There are effectively two types of ET_DYN ELF +			 * binaries: programs (i.e. PIE: ET_DYN with +			 * PT_INTERP) and loaders (i.e. static PIE: ET_DYN +			 * without PT_INTERP, usually the ELF interpreter +			 * itself). Loaders must be loaded away from programs +			 * since the program may otherwise collide with the +			 * loader (especially for ET_EXEC which does not have +			 * a randomized position). +			 * +			 * For example, to handle invocations of  			 * "./ld.so someprog" to test out a new version of  			 * the loader, the subsequent program that the  			 * loader loads must avoid the loader itself, so @@ -1118,6 +1123,9 @@ out_free_interp:  			 * ELF_ET_DYN_BASE and loaders are loaded into the  			 * independently randomized mmap region (0 load_bias  			 * without MAP_FIXED nor MAP_FIXED_NOREPLACE). 
+			 * +			 * See below for "brk" handling details, which is +			 * also affected by program vs loader and ASLR.  			 */  			if (interpreter) {  				/* On ET_DYN with PT_INTERP, we do the ASLR. */ @@ -1234,8 +1242,6 @@ out_free_interp:  	start_data += load_bias;  	end_data += load_bias; -	current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); -  	if (interpreter) {  		elf_entry = load_elf_interp(interp_elf_ex,  					    interpreter, @@ -1291,27 +1297,44 @@ out_free_interp:  	mm->end_data = end_data;  	mm->start_stack = bprm->p; -	if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) { +	/** +	 * DOC: "brk" handling +	 * +	 * For architectures with ELF randomization, when executing a +	 * loader directly (i.e. static PIE: ET_DYN without PT_INTERP), +	 * move the brk area out of the mmap region and into the unused +	 * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide +	 * early with the stack growing down or other regions being put +	 * into the mmap region by the kernel (e.g. vdso). +	 * +	 * In the CONFIG_COMPAT_BRK case, though, everything is turned +	 * off because we're not allowed to move the brk at all. +	 */ +	if (!IS_ENABLED(CONFIG_COMPAT_BRK) && +	    IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && +	    elf_ex->e_type == ET_DYN && !interpreter) { +		elf_brk = ELF_ET_DYN_BASE; +		/* This counts as moving the brk, so let brk(2) know. */ +		brk_moved = true; +	} +	mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk); + +	if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) {  		/* -		 * For architectures with ELF randomization, when executing -		 * a loader directly (i.e. no interpreter listed in ELF -		 * headers), move the brk area out of the mmap region -		 * (since it grows up, and may collide early with the stack -		 * growing down), and into the unused ELF_ET_DYN_BASE region. +		 * If we didn't move the brk to ELF_ET_DYN_BASE (above), +		 * leave a gap between .bss and brk.  		 
*/ -		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && -		    elf_ex->e_type == ET_DYN && !interpreter) { -			mm->brk = mm->start_brk = ELF_ET_DYN_BASE; -		} else { -			/* Otherwise leave a gap between .bss and brk. */ +		if (!brk_moved)  			mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; -		}  		mm->brk = mm->start_brk = arch_randomize_brk(mm); +		brk_moved = true; +	} +  #ifdef compat_brk_randomized +	if (brk_moved)  		current->brk_randomized = 1;  #endif -	}  	if (current->personality & MMAP_PAGE_ZERO) {  		/* Why this, you ask???  Well SVr4 maps page 0 as read-only, diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index fa8515598341..73a2dfb854c5 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -3,9 +3,9 @@  config BTRFS_FS  	tristate "Btrfs filesystem support"  	select BLK_CGROUP_PUNT_BIO +	select CRC32  	select CRYPTO  	select CRYPTO_CRC32C -	select LIBCRC32C  	select CRYPTO_XXHASH  	select CRYPTO_SHA256  	select CRYPTO_BLAKE2B diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e7f8ee5d48a4..7f11ef559be6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -606,7 +606,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)  	free_extent_map(em);  	cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); -	cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); +	cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);  	if (!cb->compressed_folios) {  		ret = BLK_STS_RESOURCE;  		goto out_free_bio; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index d6eef4bd9e9d..de23c4b3515e 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,  				  struct btrfs_block_group *block_group)  {  	lockdep_assert_held(&discard_ctl->lock); -	if (!btrfs_run_discard_work(discard_ctl)) -		return;  	if (list_empty(&block_group->discard_list) ||  	    block_group->discard_index == 
BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,  	if (!btrfs_is_block_group_data_only(block_group))  		return; +	if (!btrfs_run_discard_work(discard_ctl)) +		return; +  	spin_lock(&discard_ctl->lock);  	__add_to_discard_list(discard_ctl, block_group);  	spin_unlock(&discard_ctl->lock); @@ -244,6 +245,18 @@ again:  		    block_group->used != 0) {  			if (btrfs_is_block_group_data_only(block_group)) {  				__add_to_discard_list(discard_ctl, block_group); +				/* +				 * The block group must have been moved to other +				 * discard list even if discard was disabled in +				 * the meantime or a transaction abort happened, +				 * otherwise we can end up in an infinite loop, +				 * always jumping into the 'again' label and +				 * keep getting this block group over and over +				 * in case there are no other block groups in +				 * the discard lists. +				 */ +				ASSERT(block_group->discard_index != +				       BTRFS_DISCARD_INDEX_UNUSED);  			} else {  				list_del_init(&block_group->discard_list);  				btrfs_put_block_group(block_group); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3dd555db3d32..aa58e0663a5d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3853,7 +3853,6 @@ static int write_dev_supers(struct btrfs_device *device,  			atomic_inc(&device->sb_write_errors);  			continue;  		} -		ASSERT(folio_order(folio) == 0);  		offset = offset_in_folio(folio, bytenr);  		disk_super = folio_address(folio) + offset; @@ -3926,7 +3925,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)  		/* If the folio has been removed, then we know it completed. */  		if (IS_ERR(folio))  			continue; -		ASSERT(folio_order(folio) == 0);  		/* Folio will be unlocked once the write completes. 
*/  		folio_wait_locked(folio); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 197f5e51c474..13bdd60da3c7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2047,7 +2047,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)  			      subpage->bitmaps)) {  			spin_unlock_irqrestore(&subpage->lock, flags);  			spin_unlock(&folio->mapping->i_private_lock); -			bit_start++; +			bit_start += sectors_per_node;  			continue;  		} @@ -3508,8 +3508,8 @@ static void btree_clear_folio_dirty_tag(struct folio *folio)  	ASSERT(folio_test_locked(folio));  	xa_lock_irq(&folio->mapping->i_pages);  	if (!folio_test_dirty(folio)) -		__xa_clear_mark(&folio->mapping->i_pages, -				folio_index(folio), PAGECACHE_TAG_DIRTY); +		__xa_clear_mark(&folio->mapping->i_pages, folio->index, +				PAGECACHE_TAG_DIRTY);  	xa_unlock_irq(&folio->mapping->i_pages);  } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2e261892c7bc..f5b28b5c4908 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -298,6 +298,8 @@ static inline int __pure num_extent_pages(const struct extent_buffer *eb)   */  static inline int __pure num_extent_folios(const struct extent_buffer *eb)  { +	if (!eb->folios[0]) +		return 0;  	if (folio_order(eb->folios[0]))  		return 1;  	return num_extent_pages(eb); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 262a707d8990..71b8a825c447 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2104,15 +2104,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,  	 * will always return true.  	 * So here we need to do extra page alignment for  	 * filemap_range_has_page(). +	 * +	 * And do not decrease page_lockend right now, as it can be 0.  	 
*/  	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); -	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; +	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);  	while (1) {  		truncate_pagecache_range(inode, lockstart, lockend);  		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,  			    cached_state); +		/* The same page or adjacent pages. */ +		if (page_lockend <= page_lockstart) +			break;  		/*  		 * We can't have ordered extents in the range, nor dirty/writeback  		 * pages, because we have locked the inode's VFS lock in exclusive @@ -2124,7 +2129,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,  		 * we do, unlock the range and retry.  		 */  		if (!filemap_range_has_page(inode->i_mapping, page_lockstart, -					    page_lockend)) +					    page_lockend - 1))  			break;  		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bcca43046064..7baa2ed45198 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -300,6 +300,7 @@ enum {  #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR		0ULL  #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30) +#define BTRFS_WARNING_COMMIT_INTERVAL	(300)  #define BTRFS_DEFAULT_MAX_INLINE	(2048)  struct btrfs_dev_replace { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cc67d1a2d611..90f5da3c520a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1109,6 +1109,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,  	struct extent_state *cached = NULL;  	struct extent_map *em;  	int ret = 0; +	bool free_pages = false;  	u64 start = async_extent->start;  	u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1129,7 +1130,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,  	}  	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { +		ASSERT(!async_extent->folios); +		ASSERT(async_extent->nr_folios == 0);  		submit_uncompressed_range(inode, async_extent, locked_folio); +		
free_pages = true;  		goto done;  	} @@ -1145,6 +1149,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,  		 * fall back to uncompressed.  		 */  		submit_uncompressed_range(inode, async_extent, locked_folio); +		free_pages = true;  		goto done;  	} @@ -1186,6 +1191,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,  done:  	if (async_chunk->blkcg_css)  		kthread_associate_blkcg(NULL); +	if (free_pages) +		free_async_extent_pages(async_extent);  	kfree(async_extent);  	return; @@ -2129,12 +2136,13 @@ next_slot:  		/*  		 * If the found extent starts after requested offset, then -		 * adjust extent_end to be right before this extent begins +		 * adjust cur_offset to be right before this extent begins.  		 */  		if (found_key.offset > cur_offset) { -			extent_end = found_key.offset; -			extent_type = 0; -			goto must_cow; +			if (cow_start == (u64)-1) +				cow_start = cur_offset; +			cur_offset = found_key.offset; +			goto next_slot;  		}  		/* @@ -5681,8 +5689,10 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)  		return inode;  	path = btrfs_alloc_path(); -	if (!path) +	if (!path) { +		iget_failed(&inode->vfs_inode);  		return ERR_PTR(-ENOMEM); +	}  	ret = btrfs_read_locked_inode(inode, path);  	btrfs_free_path(path); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a13d81bb56a0..63aeacc54945 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4902,6 +4902,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue  	ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,  				 &disk_bytenr, &disk_io_size); +	if (ret == -EAGAIN) +		goto out_acct;  	if (ret < 0 && ret != -EIOCBQUEUED)  		goto out_free; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f948f4f6431c..e17bcb034595 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3803,7 +3803,7 @@ out:  	if (ret) {  		if (inode)  			iput(&inode->vfs_inode); -		inode = 
ERR_PTR(ret); +		return ERR_PTR(ret);  	}  	return &inode->vfs_inode;  } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2c5edcee9450..c3b2e29e3e01 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,8 +1541,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,  	u64 extent_gen;  	int ret; -	if (unlikely(!extent_root)) { -		btrfs_err(fs_info, "no valid extent root for scrub"); +	if (unlikely(!extent_root || !csum_root)) { +		btrfs_err(fs_info, "no valid extent or csum root for scrub");  		return -EUCLEAN;  	}  	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 11dbd7be6a3b..c0a0b8b063d0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -204,7 +204,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,  			   btrfs_blocks_per_folio(fs_info, folio);	\  									\  	btrfs_subpage_assert(fs_info, folio, start, len);		\ -	__start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ +	__start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \  	__start_bit += blocks_per_folio * btrfs_bitmap_nr_##name;	\  	__start_bit;							\  }) @@ -666,7 +666,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,  				btrfs_blocks_per_folio(fs_info, folio);	\  	const struct btrfs_subpage *subpage = folio_get_private(folio);	\  									\ -	ASSERT(blocks_per_folio < BITS_PER_LONG);			\ +	ASSERT(blocks_per_folio <= BITS_PER_LONG);			\  	*dst = bitmap_read(subpage->bitmaps,				\  			   blocks_per_folio * btrfs_bitmap_nr_##name,	\  			   blocks_per_folio);				\ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 40709e2a44fc..7310e2fa8526 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -569,6 +569,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)  		break;  	case Opt_commit_interval:  		ctx->commit_interval = result.uint_32; +		if (ctx->commit_interval > 
BTRFS_WARNING_COMMIT_INTERVAL) { +			btrfs_warn(NULL, "excessive commit interval %u, use with care", +				   ctx->commit_interval); +		}  		if (ctx->commit_interval == 0)  			ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;  		break; @@ -1139,8 +1143,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)  	subvol_name = btrfs_get_subvol_name_from_objectid(info,  			btrfs_root_id(BTRFS_I(d_inode(dentry))->root));  	if (!IS_ERR(subvol_name)) { -		seq_puts(seq, ",subvol="); -		seq_escape(seq, subvol_name, " \t\n\\"); +		seq_show_option(seq, "subvol", subvol_name);  		kfree(subvol_name);  	}  	return 0; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43979891f7c8..2b66a6130269 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2235,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb,  		btrfs_err(fs_info,  "tree level mismatch detected, bytenr=%llu level expected=%u has=%u",  			  eb->start, check->level, found_level); -		return -EIO; +		return -EUCLEAN;  	}  	if (!check->has_first_key) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c8c21c55be53..8e6b6fed7429 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -733,82 +733,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)  	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;  } -/* - * We can have very weird soft links passed in. - * One example is "/proc/self/fd/<fd>", which can be a soft link to - * a block device. - * - * But it's never a good idea to use those weird names. - * Here we check if the path (not following symlinks) is a good one inside - * "/dev/". 
- */ -static bool is_good_dev_path(const char *dev_path) -{ -	struct path path = { .mnt = NULL, .dentry = NULL }; -	char *path_buf = NULL; -	char *resolved_path; -	bool is_good = false; -	int ret; - -	if (!dev_path) -		goto out; - -	path_buf = kmalloc(PATH_MAX, GFP_KERNEL); -	if (!path_buf) -		goto out; - -	/* -	 * Do not follow soft link, just check if the original path is inside -	 * "/dev/". -	 */ -	ret = kern_path(dev_path, 0, &path); -	if (ret) -		goto out; -	resolved_path = d_path(&path, path_buf, PATH_MAX); -	if (IS_ERR(resolved_path)) -		goto out; -	if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) -		goto out; -	is_good = true; -out: -	kfree(path_buf); -	path_put(&path); -	return is_good; -} - -static int get_canonical_dev_path(const char *dev_path, char *canonical) -{ -	struct path path = { .mnt = NULL, .dentry = NULL }; -	char *path_buf = NULL; -	char *resolved_path; -	int ret; - -	if (!dev_path) { -		ret = -EINVAL; -		goto out; -	} - -	path_buf = kmalloc(PATH_MAX, GFP_KERNEL); -	if (!path_buf) { -		ret = -ENOMEM; -		goto out; -	} - -	ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); -	if (ret) -		goto out; -	resolved_path = d_path(&path, path_buf, PATH_MAX); -	if (IS_ERR(resolved_path)) { -		ret = PTR_ERR(resolved_path); -		goto out; -	} -	ret = strscpy(canonical, resolved_path, PATH_MAX); -out: -	kfree(path_buf); -	path_put(&path); -	return ret; -} -  static bool is_same_device(struct btrfs_device *device, const char *new_path)  {  	struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1513,23 +1437,12 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,  	bool new_device_added = false;  	struct btrfs_device *device = NULL;  	struct file *bdev_file; -	char *canonical_path = NULL;  	u64 bytenr;  	dev_t devt;  	int ret;  	lockdep_assert_held(&uuid_mutex); -	if (!is_good_dev_path(path)) { -		canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); -		if (canonical_path) { -			ret = get_canonical_dev_path(path, canonical_path); -			
if (ret < 0) { -				kfree(canonical_path); -				canonical_path = NULL; -			} -		} -	}  	/*  	 * Avoid an exclusive open here, as the systemd-udev may initiate the  	 * device scan which may race with the user's mount or mkfs command, @@ -1574,8 +1487,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,  		goto free_disk_super;  	} -	device = device_list_add(canonical_path ? : path, disk_super, -				 &new_device_added); +	device = device_list_add(path, disk_super, &new_device_added);  	if (!IS_ERR(device) && new_device_added)  		btrfs_free_stale_devices(device->devt, device); @@ -1584,7 +1496,6 @@ free_disk_super:  error_bdev_put:  	fput(bdev_file); -	kfree(canonical_path);  	return device;  } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fb8b8b29c169..4a3e02b49f29 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1277,7 +1277,7 @@ struct zone_info {  static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,  				struct zone_info *info, unsigned long *active, -				struct btrfs_chunk_map *map) +				struct btrfs_chunk_map *map, bool new)  {  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;  	struct btrfs_device *device; @@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,  		return 0;  	} +	ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); +  	/* This zone will be used for allocation, so mark this zone non-empty. */  	btrfs_dev_clear_zone_empty(device, info->physical); @@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,  	 * to determine the allocation offset within the zone.  	 
*/  	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + +	if (new) { +		sector_t capacity; + +		capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); +		up_read(&dev_replace->rwsem); +		info->alloc_offset = 0; +		info->capacity = capacity << SECTOR_SHIFT; + +		return 0; +	} +  	nofs_flag = memalloc_nofs_save();  	ret = btrfs_get_dev_zone(device, info->physical, &zone);  	memalloc_nofs_restore(nofs_flag); @@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)  	}  	for (i = 0; i < map->num_stripes; i++) { -		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); +		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);  		if (ret)  			goto out; @@ -1659,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)  		 * stripe.  		 */  		cache->alloc_offset = cache->zone_capacity; -		ret = 0;  	}  out: diff --git a/fs/buffer.c b/fs/buffer.c index c7abb4a029dc..7ba1807145aa 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)  }  EXPORT_SYMBOL(end_buffer_write_sync); -/* - * Various filesystems appear to want __find_get_block to be non-blocking. - * But it's the page lock which protects the buffers.  To get around this, - * we get exclusion from try_to_free_buffers with the blockdev mapping's - * i_private_lock. - * - * Hack idea: for the blockdev mapping, i_private_lock contention - * may be quite high.  This code could TryLock the page, and if that - * succeeds, there is no need to take i_private_lock. 
- */  static struct buffer_head * -__find_get_block_slow(struct block_device *bdev, sector_t block) +__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)  {  	struct address_space *bd_mapping = bdev->bd_mapping;  	const int blkbits = bd_mapping->host->i_blkbits; @@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)  	if (IS_ERR(folio))  		goto out; -	spin_lock(&bd_mapping->i_private_lock); +	/* +	 * Folio lock protects the buffers. Callers that cannot block +	 * will fallback to serializing vs try_to_free_buffers() via +	 * the i_private_lock. +	 */ +	if (atomic) +		spin_lock(&bd_mapping->i_private_lock); +	else +		folio_lock(folio); +  	head = folio_buffers(folio);  	if (!head)  		goto out_unlock; +	/* +	 * Upon a noref migration, the folio lock serializes here; +	 * otherwise bail. +	 */ +	if (test_bit_acquire(BH_Migrate, &head->b_state)) { +		WARN_ON(!atomic); +		goto out_unlock; +	} +  	bh = head;  	do {  		if (!buffer_mapped(bh)) @@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)  		       1 << blkbits);  	}  out_unlock: -	spin_unlock(&bd_mapping->i_private_lock); +	if (atomic) +		spin_unlock(&bd_mapping->i_private_lock); +	else +		folio_unlock(folio);  	folio_put(folio);  out:  	return ret; @@ -656,7 +667,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);  void write_boundary_block(struct block_device *bdev,  			sector_t bblock, unsigned blocksize)  { -	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); +	struct buffer_head *bh; + +	bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);  	if (bh) {  		if (buffer_dirty(bh))  			write_dirty_buffer(bh, 0); @@ -1207,10 +1220,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh)  	/* FIXME: do we need to set this in both places? 
*/  	if (bh->b_folio && bh->b_folio->mapping)  		mapping_set_error(bh->b_folio->mapping, -EIO); -	if (bh->b_assoc_map) { +	if (bh->b_assoc_map)  		mapping_set_error(bh->b_assoc_map, -EIO); -		errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO); -	}  }  EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1386,16 +1397,18 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)  /*   * Perform a pagecache lookup for the matching buffer.  If it's there, refresh   * it in the LRU and mark it as accessed.  If it is not present then return - * NULL + * NULL. Atomic context callers may also return NULL if the buffer is being + * migrated; similarly the page is not marked accessed either.   */ -struct buffer_head * -__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +static struct buffer_head * +find_get_block_common(struct block_device *bdev, sector_t block, +			unsigned size, bool atomic)  {  	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);  	if (bh == NULL) {  		/* __find_get_block_slow will mark the page accessed */ -		bh = __find_get_block_slow(bdev, block); +		bh = __find_get_block_slow(bdev, block, atomic);  		if (bh)  			bh_lru_install(bh);  	} else @@ -1403,8 +1416,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)  	return bh;  } + +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ +	return find_get_block_common(bdev, block, size, true); +}  EXPORT_SYMBOL(__find_get_block); +/* same as __find_get_block() but allows sleeping contexts */ +struct buffer_head * +__find_get_block_nonatomic(struct block_device *bdev, sector_t block, +			   unsigned size) +{ +	return find_get_block_common(bdev, block, size, false); +} +EXPORT_SYMBOL(__find_get_block_nonatomic); +  /**   * bdev_getblk - Get a buffer_head in a block device's buffer cache.   * @bdev: The block device. 
@@ -1422,7 +1450,12 @@ EXPORT_SYMBOL(__find_get_block);  struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,  		unsigned size, gfp_t gfp)  { -	struct buffer_head *bh = __find_get_block(bdev, block, size); +	struct buffer_head *bh; + +	if (gfpflags_allow_blocking(gfp)) +		bh = __find_get_block_nonatomic(bdev, block, size); +	else +		bh = __find_get_block(bdev, block, size);  	might_alloc(gfp);  	if (bh) diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index bf935e25bdbe..b48525680e73 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -8,7 +8,7 @@  #include <linux/slab.h>  #include "internal.h" -static const char cachefiles_charmap[64] = +static const char cachefiles_charmap[64] __nonstring =  	"0123456789"			/* 0 - 9 */  	"abcdefghijklmnopqrstuvwxyz"	/* 10 - 35 */  	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"	/* 36 - 61 */ diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 7249d70e1a43..3e7def3d31c1 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -3,7 +3,7 @@ config CEPH_FS  	tristate "Ceph distributed file system"  	depends on INET  	select CEPH_LIB -	select LIBCRC32C +	select CRC32  	select CRYPTO_AES  	select CRYPTO  	select NETFS_SUPPORT diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 6ac2bd555e86..06cd2963e41e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2367,7 +2367,7 @@ static int fill_fscrypt_truncate(struct inode *inode,  	/* Try to writeback the dirty pagecaches */  	if (issued & (CEPH_CAP_FILE_BUFFER)) { -		loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; +		loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;  		ret = filemap_write_and_wait_range(inode->i_mapping,  						   orig_pos, lend); @@ -396,6 +396,7 @@ static inline unsigned long dax_folio_put(struct folio *folio)  	order = folio_order(folio);  	if (!order)  		return 0; +	folio_reset_order(folio);  	for (i = 0; i < (1UL << order); i++) {  		struct dev_pagemap *pgmap = page_pgmap(&folio->page); diff --git a/fs/devpts/inode.c 
b/fs/devpts/inode.c index 42e4d6eeb29f..9c20d78e41f6 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -89,12 +89,12 @@ enum {  };  static const struct fs_parameter_spec devpts_param_specs[] = { -	fsparam_u32	("gid",		Opt_gid), +	fsparam_gid	("gid",		Opt_gid),  	fsparam_s32	("max",		Opt_max),  	fsparam_u32oct	("mode",	Opt_mode),  	fsparam_flag	("newinstance",	Opt_newinstance),  	fsparam_u32oct	("ptmxmode",	Opt_ptmxmode), -	fsparam_u32	("uid",		Opt_uid), +	fsparam_uid	("uid",		Opt_uid),  	{}  }; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 331e49cd1b8d..8f68ec49ad89 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,8 +3,8 @@  config EROFS_FS  	tristate "EROFS filesystem support"  	depends on BLOCK +	select CRC32  	select FS_IOMAP -	select LIBCRC32C  	help  	  EROFS (Enhanced Read-Only File System) is a lightweight read-only  	  file system with modern designs (e.g. no buffer heads, inline diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 9581e9bf8192..767fb4acdc93 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -56,7 +56,7 @@ struct erofs_super_block {  	union {  		__le16 rootnid_2b;	/* nid of root directory */  		__le16 blocks_hi;	/* (48BIT on) blocks count MSB */ -	} rb; +	} __packed rb;  	__le64 inos;            /* total valid ino # (== f_files - f_favail) */  	__le64 epoch;		/* base seconds used for compact inodes */  	__le32 fixed_nsec;	/* fixed nanoseconds for compact inodes */ @@ -148,7 +148,7 @@ union erofs_inode_i_nb {  	__le16 nlink;		/* if EROFS_I_NLINK_1_BIT is unset */  	__le16 blocks_hi;	/* total blocks count MSB */  	__le16 startblk_hi;	/* starting block number MSB */ -}; +} __packed;  /* 32-byte reduced form of an ondisk inode */  struct erofs_inode_compact { @@ -369,9 +369,9 @@ struct z_erofs_map_header {  			 * bit 7   : pack the whole file into packed inode  			 */  			__u8	h_clusterbits; -		}; +		} __packed;  		__le16 h_extents_hi;	/* extent count MSB */ -	}; +	} __packed;  };  enum { diff --git 
a/fs/erofs/fileio.c b/fs/erofs/fileio.c index bec4b56b3826..60c7cc4c105c 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -32,6 +32,8 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)  		ret = 0;  	}  	if (rq->bio.bi_end_io) { +		if (ret < 0 && !rq->bio.bi_status) +			rq->bio.bi_status = errno_to_blk_status(ret);  		rq->bio.bi_end_io(&rq->bio);  	} else {  		bio_for_each_folio_all(fi, &rq->bio) { @@ -148,10 +150,10 @@ io_retry:  				io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9;  				attached = 0;  			} -			if (!attached++) -				erofs_onlinefolio_split(folio);  			if (!bio_add_folio(&io->rq->bio, folio, len, cur))  				goto io_retry; +			if (!attached++) +				erofs_onlinefolio_split(folio);  			io->dev.m_pa += len;  		}  		cur += len; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index cadec6b1b554..da6ee7c39290 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -357,7 +357,6 @@ static void erofs_default_options(struct erofs_sb_info *sbi)  enum {  	Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,  	Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, -	Opt_err  };  static const struct constant_table erofs_param_cache_strategy[] = { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 0671184d9cf1..b8e6b76c23d5 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -79,9 +79,6 @@ struct z_erofs_pcluster {  	/* L: whether partial decompression or not */  	bool partial; -	/* L: indicate several pageofs_outs or not */ -	bool multibases; -  	/* L: whether extra buffer allocations are best-effort */  	bool besteffort; @@ -725,7 +722,6 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)  	lockref_init(&pcl->lockref); /* one ref for this request */  	pcl->algorithmformat = map->m_algorithmformat;  	pcl->pclustersize = map->m_plen; -	pcl->pageofs_in = pageofs_in;  	pcl->length = 0;  	pcl->partial = true;  	pcl->next = fe->head; @@ -1047,8 +1043,6 @@ static int z_erofs_scan_folio(struct 
z_erofs_frontend *f,  				break;  			erofs_onlinefolio_split(folio); -			if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) -				f->pcl->multibases = true;  			if (f->pcl->length < offset + end - map->m_la) {  				f->pcl->length = offset + end - map->m_la;  				f->pcl->pageofs_out = map->m_la & ~PAGE_MASK; @@ -1094,7 +1088,6 @@ struct z_erofs_backend {  	struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];  	struct super_block *sb;  	struct z_erofs_pcluster *pcl; -  	/* pages with the longest decompressed length for deduplication */  	struct page **decompressed_pages;  	/* pages to keep the compressed data */ @@ -1103,6 +1096,8 @@ struct z_erofs_backend {  	struct list_head decompressed_secondary_bvecs;  	struct page **pagepool;  	unsigned int onstack_used, nr_pages; +	/* indicate if temporary copies should be preserved for later use */ +	bool keepxcpy;  };  struct z_erofs_bvec_item { @@ -1113,18 +1108,20 @@ struct z_erofs_bvec_item {  static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,  					 struct z_erofs_bvec *bvec)  { +	int poff = bvec->offset + be->pcl->pageofs_out;  	struct z_erofs_bvec_item *item; -	unsigned int pgnr; - -	if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && -	    (bvec->end == PAGE_SIZE || -	     bvec->offset + bvec->end == be->pcl->length)) { -		pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; -		DBG_BUGON(pgnr >= be->nr_pages); -		if (!be->decompressed_pages[pgnr]) { -			be->decompressed_pages[pgnr] = bvec->page; +	struct page **page; + +	if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE || +			bvec->offset + bvec->end == be->pcl->length)) { +		DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages); +		page = be->decompressed_pages + (poff >> PAGE_SHIFT); +		if (!*page) { +			*page = bvec->page;  			return;  		} +	} else { +		be->keepxcpy = true;  	}  	/* (cold path) one pcluster is requested multiple times */ @@ -1290,7 +1287,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)  	
				.alg = pcl->algorithmformat,  					.inplace_io = overlapped,  					.partial_decoding = pcl->partial, -					.fillgaps = pcl->multibases, +					.fillgaps = be->keepxcpy,  					.gfp = pcl->besteffort ? GFP_KERNEL :  						GFP_NOWAIT | __GFP_NORETRY  				 }, be->pagepool); @@ -1347,7 +1344,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)  	pcl->length = 0;  	pcl->partial = true; -	pcl->multibases = false;  	pcl->besteffort = false;  	pcl->bvset.nextpage = NULL;  	pcl->vcnt = 0; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 8de50df05dfe..14ea47f954f5 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -559,7 +559,8 @@ static int z_erofs_map_blocks_ext(struct inode *inode,  			pos += sizeof(__le64);  			lstart = 0;  		} else { -			lstart = map->m_la >> vi->z_lclusterbits; +			lstart = round_down(map->m_la, 1 << vi->z_lclusterbits); +			pos += (lstart >> vi->z_lclusterbits) * recsz;  			pa = EROFS_NULL_ADDR;  		} @@ -614,7 +615,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,  		if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {  			map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;  			vi->z_fragmentoff = map->m_plen; -			if (recsz >= offsetof(struct z_erofs_extent, pstart_lo)) +			if (recsz > offsetof(struct z_erofs_extent, pstart_lo))  				vi->z_fragmentoff |= map->m_pa << 32;  		} else if (map->m_plen) {  			map->m_flags |= EROFS_MAP_MAPPED | diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 100376863a44..d4dbffdedd08 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1996,6 +1996,14 @@ static int ep_try_send_events(struct eventpoll *ep,  	return res;  } +static int ep_schedule_timeout(ktime_t *to) +{ +	if (to) +		return ktime_after(*to, ktime_get()); +	else +		return 1; +} +  /**   * ep_poll - Retrieves ready events, and delivers them to the caller-supplied   *           event buffer. 
@@ -2104,8 +2112,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,  		write_unlock_irq(&ep->lock);  		if (!eavail) -			timed_out = !schedule_hrtimeout_range(to, slack, -							      HRTIMER_MODE_ABS); +			timed_out = !ep_schedule_timeout(to) || +				!schedule_hrtimeout_range(to, slack, +							  HRTIMER_MODE_ABS);  		__set_current_state(TASK_RUNNING);  		/* diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 87ee3a17bd29..e8c5525afc67 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,  {  	__le32 *bref = p;  	unsigned int blk; +	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; -	if (ext4_has_feature_journal(inode->i_sb) && -	    (inode->i_ino == -	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) +	if (journal && inode == journal->j_inode)  		return 0;  	while (bref < p+max) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 38bc8d74f4cc..e7ecc7c8a729 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)  	if (!bh || !buffer_uptodate(bh))  		/*  		 * If the block is not in the buffer cache, then it -		 * must have been written out. +		 * must have been written out, or, most unlikely, is +		 * being migrated - false failure should be OK here.  		 
*/  		goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1dc09ed5d403..94c7d2d828a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -386,10 +386,11 @@ static int __check_block_validity(struct inode *inode, const char *func,  				unsigned int line,  				struct ext4_map_blocks *map)  { -	if (ext4_has_feature_journal(inode->i_sb) && -	    (inode->i_ino == -	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) +	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + +	if (journal && inode == journal->j_inode)  		return 0; +  	if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {  		ext4_error_inode(inode, func, line, map->m_pblk,  				 "lblock %lu mapped to illegal pblock %llu " @@ -4724,22 +4725,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)  		inode_set_iversion_queried(inode, val);  } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, +			    const char *function, unsigned int line)  { +	const char *err_str; +  	if (flags & EXT4_IGET_EA_INODE) { -		if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) -			return "missing EA_INODE flag"; +		if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { +			err_str = "missing EA_INODE flag"; +			goto error; +		}  		if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || -		    EXT4_I(inode)->i_file_acl) -			return "ea_inode with extended attributes"; +		    EXT4_I(inode)->i_file_acl) { +			err_str = "ea_inode with extended attributes"; +			goto error; +		}  	} else { -		if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) -			return "unexpected EA_INODE flag"; +		if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { +			/* +			 * open_by_handle_at() could provide an old inode number +			 * that has since been reused for an ea_inode; this does +			 * not indicate filesystem corruption +			 */ +			if (flags & EXT4_IGET_HANDLE) +				return -ESTALE; +			err_str = 
"unexpected EA_INODE flag"; +			goto error; +		} +	} +	if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { +		err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; +		goto error;  	} -	if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) -		return "unexpected bad inode w/o EXT4_IGET_BAD"; -	return NULL; +	return 0; + +error: +	ext4_error_inode(inode, function, line, 0, err_str); +	return -EFSCORRUPTED;  }  struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4751,7 +4773,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,  	struct ext4_inode_info *ei;  	struct ext4_super_block *es = EXT4_SB(sb)->s_es;  	struct inode *inode; -	const char *err_str;  	journal_t *journal = EXT4_SB(sb)->s_journal;  	long ret;  	loff_t size; @@ -4780,10 +4801,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,  	if (!inode)  		return ERR_PTR(-ENOMEM);  	if (!(inode->i_state & I_NEW)) { -		if ((err_str = check_igot_inode(inode, flags)) != NULL) { -			ext4_error_inode(inode, function, line, 0, err_str); +		ret = check_igot_inode(inode, flags, function, line); +		if (ret) {  			iput(inode); -			return ERR_PTR(-EFSCORRUPTED); +			return ERR_PTR(ret);  		}  		return inode;  	} @@ -5065,13 +5086,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,  		ret = -EFSCORRUPTED;  		goto bad_inode;  	} -	if ((err_str = check_igot_inode(inode, flags)) != NULL) { -		ext4_error_inode(inode, function, line, 0, err_str); -		ret = -EFSCORRUPTED; -		goto bad_inode; +	ret = check_igot_inode(inode, flags, function, line); +	/* +	 * -ESTALE here means there is nothing inherently wrong with the inode, +	 * it's just not an inode we can return for an fhandle lookup. 
+	 */ +	if (ret == -ESTALE) { +		brelse(iloc.bh); +		unlock_new_inode(inode); +		iput(inode); +		return ERR_PTR(-ESTALE);  	} - +	if (ret) +		goto bad_inode;  	brelse(iloc.bh); +  	unlock_new_inode(inode);  	return inode; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0d523e9fb3d5..1e98c5be4e0a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3037,10 +3037,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)  	unsigned char blocksize_bits = min_t(unsigned char,  					     sb->s_blocksize_bits,  					     EXT4_MAX_BLOCK_LOG_SIZE); -	struct sg { -		struct ext4_group_info info; -		ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; -	} sg; +	DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, +			EXT4_MAX_BLOCK_LOG_SIZE + 2);  	group--;  	if (group == 0) @@ -3048,7 +3046,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)  			      " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "  			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n"); -	i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + +	i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) +  		sizeof(struct ext4_group_info);  	grinfo = ext4_get_group_info(sb, group); @@ -3068,14 +3066,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)  	 * We care only about free space counters in the group info and  	 * these are safe to access even after the buddy has been unloaded  	 */ -	memcpy(&sg, grinfo, i); -	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, -			sg.info.bb_fragments, sg.info.bb_first_free); +	memcpy(sg, grinfo, i); +	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, +			sg->bb_fragments, sg->bb_first_free);  	for (i = 0; i <= 13; i++)  		seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
-				sg.info.bb_counters[i] : 0); +				sg->bb_counters[i] : 0);  	seq_puts(seq, " ]"); -	if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) +	if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg))  		seq_puts(seq, " Block bitmap corrupted!");  	seq_putc(seq, '\n');  	return 0; @@ -6644,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,  		for (i = 0; i < count; i++) {  			cond_resched();  			if (is_metadata) -				bh = sb_find_get_block(inode->i_sb, block + i); +				bh = sb_find_get_block_nonatomic(inode->i_sb, +								 block + i);  			ext4_forget(handle, is_metadata, inode, bh, block + i);  		}  	} diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cb5cb33b1d91..e9712e64ec8f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1971,7 +1971,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,  	 * split it in half by count; each resulting block will have at least  	 * half the space free.  	 */ -	if (i > 0) +	if (i >= 0)  		split = count - move;  	else  		split = count/2; diff --git a/fs/file.c b/fs/file.c index dc3f7e120e3e..3a3146664cf3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,7 +26,7 @@  #include "internal.h" -bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) +static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)  {  	/*  	 * If the reference count was already in the dead zone, then this diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 2c7b24cb67ad..53c2626e90e7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1669,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc)  	unsigned int virtqueue_size;  	int err = -EIO; +	if (!fsc->source) +		return invalf(fsc, "No source specified"); +  	/* This gets a reference on virtio_fs object. This ptr gets installed  	 * in fc->iq->priv. Once fuse_conn is going away, it calls ->put()  	 * to drop the reference to this object. 
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index be7f87a8e11a..7bd231d16d4a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -4,7 +4,6 @@ config GFS2_FS  	select BUFFER_HEAD  	select FS_POSIX_ACL  	select CRC32 -	select LIBCRC32C  	select QUOTACTL  	select FS_IOMAP  	help diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 6add6ebfef89..cb823a8a6ba9 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)  	else  		key_len = tree->max_key_len + 1; +	if (key_len > sizeof(hfs_btree_key) || key_len < 1) { +		memset(key, 0, sizeof(hfs_btree_key)); +		pr_err("hfs: Invalid key length: %d\n", key_len); +		return; +	} +  	hfs_bnode_read(node, key, off, key_len);  } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 87974d5e6791..079ea80534f7 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)  	else  		key_len = tree->max_key_len + 2; +	if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) { +		memset(key, 0, sizeof(hfsplus_btree_key)); +		pr_err("hfsplus: Invalid key length: %d\n", key_len); +		return; +	} +  	hfs_bnode_read(node, key, off, key_len);  } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 31553372b33a..5b08bd417b28 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -259,7 +259,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,  		}  		/* truncate len if we find any trailing uptodate block(s) */ -		for ( ; i <= last; i++) { +		while (++i <= last) {  			if (ifs_block_is_uptodate(ifs, i)) {  				plen -= (last - i + 1) * block_size;  				last = i - 1; diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 35768a63fb1d..421d247fae52 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,  		return NULL;  	return 
isofs_export_iget(sb, -			fh_len > 2 ? ifid->parent_block : 0, +			fh_len > 3 ? ifid->parent_block : 0,  			ifid->parent_offset,  			fh_len > 4 ? ifid->parent_generation : 0);  } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 0cf0fddbee81..1467f6790747 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,  	bh = bh_in;  	if (!bh) { -		bh = __find_get_block(bdev, blocknr, journal->j_blocksize); +		bh = __find_get_block_nonatomic(bdev, blocknr, +						journal->j_blocksize);  		if (bh)  			BUFFER_TRACE(bh, "found on hash");  	} @@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,  		/* If there is a different buffer_head lying around in  		 * memory anywhere... */ -		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); +		bh2 = __find_get_block_nonatomic(bdev, blocknr, +						 journal->j_blocksize);  		if (bh2) {  			/* ... and it has RevokeValid status... */  			if (bh2 != bh && buffer_revokevalid(bh2)) @@ -464,7 +466,8 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)  	 * state machine will get very upset later on. 
*/  	if (need_cancel) {  		struct buffer_head *bh2; -		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); +		bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, +						 bh->b_size);  		if (bh2) {  			if (bh2 != bh)  				clear_buffer_revoked(bh2); @@ -492,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)  			struct jbd2_revoke_record_s *record;  			struct buffer_head *bh;  			record = (struct jbd2_revoke_record_s *)list_entry; -			bh = __find_get_block(journal->j_fs_dev, -					      record->blocknr, -					      journal->j_blocksize); +			bh = __find_get_block_nonatomic(journal->j_fs_dev, +							record->blocknr, +							journal->j_blocksize);  			if (bh) {  				clear_buffer_revoked(bh);  				__brelse(bh); diff --git a/fs/namei.c b/fs/namei.c index 360a86ca1f02..84a0e0b0111c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -125,9 +125,9 @@  #define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname)) -static inline void initname(struct filename *name) +static inline void initname(struct filename *name, const char __user *uptr)  { -	name->uptr = NULL; +	name->uptr = uptr;  	name->aname = NULL;  	atomic_set(&name->refcnt, 1);  } @@ -210,7 +210,7 @@ getname_flags(const char __user *filename, int flags)  			return ERR_PTR(-ENAMETOOLONG);  		}  	} -	initname(result); +	initname(result, filename);  	audit_getname(result);  	return result;  } @@ -268,7 +268,7 @@ struct filename *getname_kernel(const char * filename)  		return ERR_PTR(-ENAMETOOLONG);  	}  	memcpy((char *)result->name, filename, len); -	initname(result); +	initname(result, NULL);  	audit_getname(result);  	return result;  } @@ -1665,27 +1665,20 @@ static struct dentry *lookup_dcache(const struct qstr *name,  	return dentry;  } -/* - * Parent directory has inode locked exclusive.  
This is one - * and only case when ->lookup() gets called on non in-lookup - * dentries - as the matter of fact, this only gets called - * when directory is guaranteed to have no in-lookup children - * at all. - * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. - * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. - */ -struct dentry *lookup_one_qstr_excl(const struct qstr *name, -				    struct dentry *base, -				    unsigned int flags) +static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, +					       struct dentry *base, +					       unsigned int flags)  { -	struct dentry *dentry = lookup_dcache(name, base, flags); +	struct dentry *dentry;  	struct dentry *old; -	struct inode *dir = base->d_inode; +	struct inode *dir; +	dentry = lookup_dcache(name, base, flags);  	if (dentry) -		goto found; +		return dentry;  	/* Don't create child dentry for a dead directory. */ +	dir = base->d_inode;  	if (unlikely(IS_DEADDIR(dir)))  		return ERR_PTR(-ENOENT); @@ -1698,7 +1691,24 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,  		dput(dentry);  		dentry = old;  	} -found: +	return dentry; +} + +/* + * Parent directory has inode locked exclusive.  This is one + * and only case when ->lookup() gets called on non in-lookup + * dentries - as the matter of fact, this only gets called + * when directory is guaranteed to have no in-lookup children + * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. 
+ */ +struct dentry *lookup_one_qstr_excl(const struct qstr *name, +				    struct dentry *base, unsigned int flags) +{ +	struct dentry *dentry; + +	dentry = lookup_one_qstr_excl_raw(name, base, flags);  	if (IS_ERR(dentry))  		return dentry;  	if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { @@ -2742,23 +2752,48 @@ static int filename_parentat(int dfd, struct filename *name,  /* does lookup, returns the object with parent locked */  static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)  { +	struct path parent_path __free(path_put) = {};  	struct dentry *d;  	struct qstr last;  	int type, error; -	error = filename_parentat(dfd, name, 0, path, &last, &type); +	error = filename_parentat(dfd, name, 0, &parent_path, &last, &type);  	if (error)  		return ERR_PTR(error); -	if (unlikely(type != LAST_NORM)) { -		path_put(path); +	if (unlikely(type != LAST_NORM))  		return ERR_PTR(-EINVAL); +	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); +	d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); +	if (IS_ERR(d)) { +		inode_unlock(parent_path.dentry->d_inode); +		return d;  	} -	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); -	d = lookup_one_qstr_excl(&last, path->dentry, 0); +	path->dentry = no_free_ptr(parent_path.dentry); +	path->mnt = no_free_ptr(parent_path.mnt); +	return d; +} + +struct dentry *kern_path_locked_negative(const char *name, struct path *path) +{ +	struct path parent_path __free(path_put) = {}; +	struct filename *filename __free(putname) = getname_kernel(name); +	struct dentry *d; +	struct qstr last; +	int type, error; + +	error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); +	if (error) +		return ERR_PTR(error); +	if (unlikely(type != LAST_NORM)) +		return ERR_PTR(-EINVAL); +	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); +	d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0);  	if (IS_ERR(d)) { -		inode_unlock(path->dentry->d_inode); 
-		path_put(path); +		inode_unlock(parent_path.dentry->d_inode); +		return d;  	} +	path->dentry = no_free_ptr(parent_path.dentry); +	path->mnt = no_free_ptr(parent_path.mnt);  	return d;  } diff --git a/fs/namespace.c b/fs/namespace.c index 14935a0500a2..1b466c54a357 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -787,15 +787,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)  		return 0;  	mnt = real_mount(bastard);  	mnt_add_count(mnt, 1); -	smp_mb();			// see mntput_no_expire() +	smp_mb();		// see mntput_no_expire() and do_umount()  	if (likely(!read_seqretry(&mount_lock, seq)))  		return 0; -	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { -		mnt_add_count(mnt, -1); -		return 1; -	}  	lock_mount_hash(); -	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { +	if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {  		mnt_add_count(mnt, -1);  		unlock_mount_hash();  		return 1; @@ -1830,6 +1826,8 @@ static inline void namespace_lock(void)  	down_write(&namespace_sem);  } +DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock()) +  enum umount_tree_flags {  	UMOUNT_SYNC = 1,  	UMOUNT_PROPAGATE = 2, @@ -2046,6 +2044,7 @@ static int do_umount(struct mount *mnt, int flags)  			umount_tree(mnt, UMOUNT_PROPAGATE);  		retval = 0;  	} else { +		smp_mb(); // paired with __legitimize_mnt()  		shrink_submounts(mnt);  		retval = -EBUSY;  		if (!propagate_mount_busy(mnt, 2)) { @@ -2383,7 +2382,7 @@ void dissolve_on_fput(struct vfsmount *mnt)  			return;  	} -	scoped_guard(rwsem_write, &namespace_sem) { +	scoped_guard(namespace_lock, &namespace_sem) {  		ns = m->mnt_ns;  		if (!must_dissolve(ns))  			return; @@ -2824,56 +2823,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)  	struct vfsmount *mnt = path->mnt;  	struct dentry *dentry;  	struct mountpoint *mp = ERR_PTR(-ENOENT); +	struct path under = {};  	for (;;) { -		struct mount *m; +		struct mount *m = real_mount(mnt);  		if (beneath) 
{ -			m = real_mount(mnt); +			path_put(&under);  			read_seqlock_excl(&mount_lock); -			dentry = dget(m->mnt_mountpoint); +			under.mnt = mntget(&m->mnt_parent->mnt); +			under.dentry = dget(m->mnt_mountpoint);  			read_sequnlock_excl(&mount_lock); +			dentry = under.dentry;  		} else {  			dentry = path->dentry;  		}  		inode_lock(dentry->d_inode); -		if (unlikely(cant_mount(dentry))) { -			inode_unlock(dentry->d_inode); -			goto out; -		} -  		namespace_lock(); -		if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { +		if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) +			break;		// not to be mounted on + +		if (beneath && unlikely(m->mnt_mountpoint != dentry || +				        &m->mnt_parent->mnt != under.mnt)) {  			namespace_unlock();  			inode_unlock(dentry->d_inode); -			goto out; +			continue;	// got moved  		}  		mnt = lookup_mnt(path); -		if (likely(!mnt)) +		if (unlikely(mnt)) { +			namespace_unlock(); +			inode_unlock(dentry->d_inode); +			path_put(path); +			path->mnt = mnt; +			path->dentry = dget(mnt->mnt_root); +			continue;	// got overmounted +		} +		mp = get_mountpoint(dentry); +		if (IS_ERR(mp))  			break; - -		namespace_unlock(); -		inode_unlock(dentry->d_inode); -		if (beneath) -			dput(dentry); -		path_put(path); -		path->mnt = mnt; -		path->dentry = dget(mnt->mnt_root); -	} - -	mp = get_mountpoint(dentry); -	if (IS_ERR(mp)) { -		namespace_unlock(); -		inode_unlock(dentry->d_inode); +		if (beneath) { +			/* +			 * @under duplicates the references that will stay +			 * at least until namespace_unlock(), so the path_put() +			 * below is safe (and OK to do under namespace_lock - +			 * we are not dropping the final references here). 
+			 */ +			path_put(&under); +		} +		return mp;  	} - -out: +	namespace_unlock(); +	inode_unlock(dentry->d_inode);  	if (beneath) -		dput(dentry); - +		path_put(&under);  	return mp;  } @@ -2884,14 +2889,11 @@ static inline struct mountpoint *lock_mount(struct path *path)  static void unlock_mount(struct mountpoint *where)  { -	struct dentry *dentry = where->m_dentry; - +	inode_unlock(where->m_dentry->d_inode);  	read_seqlock_excl(&mount_lock);  	put_mountpoint(where);  	read_sequnlock_excl(&mount_lock); -  	namespace_unlock(); -	inode_unlock(dentry->d_inode);  }  static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) @@ -3555,7 +3557,8 @@ static int can_move_mount_beneath(const struct path *from,  	 * @mnt_from itself. This defeats the whole purpose of mounting  	 * @mnt_from beneath @mnt_to.  	 */ -	if (propagation_would_overmount(parent_mnt_to, mnt_from, mp)) +	if (check_mnt(mnt_from) && +	    propagation_would_overmount(parent_mnt_to, mnt_from, mp))  		return -EINVAL;  	return 0; @@ -3713,15 +3716,14 @@ static int do_move_mount(struct path *old_path,  	if (err)  		goto out; -	if (is_anon_ns(ns)) -		ns->mntns_flags &= ~MNTNS_PROPAGATING; -  	/* if the mount is moved, it should no longer be expire  	 * automatically */  	list_del_init(&old->mnt_expire);  	if (attached)  		put_mountpoint(old_mp);  out: +	if (is_anon_ns(ns)) +		ns->mntns_flags &= ~MNTNS_PROPAGATING;  	unlock_mount(mp);  	if (!err) {  		if (attached) { @@ -5189,8 +5191,8 @@ static void finish_mount_kattr(struct mount_kattr *kattr)  		mnt_idmap_put(kattr->mnt_idmap);  } -static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize, -			      struct mount_kattr *kattr) +static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, +			       struct mount_kattr *kattr)  {  	int ret;  	struct mount_attr attr; @@ -5213,9 +5215,13 @@ static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,  	if (attr.attr_set == 0 &&  	    
attr.attr_clr == 0 &&  	    attr.propagation == 0) -		return 0; +		return 0; /* Tell caller to not bother. */ + +	ret = build_mount_kattr(&attr, usize, kattr); +	if (ret < 0) +		return ret; -	return build_mount_kattr(&attr, usize, kattr); +	return 1;  }  SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, @@ -5247,8 +5253,8 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,  	if (flags & AT_RECURSIVE)  		kattr.kflags |= MOUNT_KATTR_RECURSE; -	err = copy_mount_setattr(uattr, usize, &kattr); -	if (err) +	err = wants_mount_setattr(uattr, usize, &kattr); +	if (err <= 0)  		return err;  	err = user_path_at(dfd, path, kattr.lookup_flags, &target); @@ -5282,15 +5288,17 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,  		if (flags & AT_RECURSIVE)  			kattr.kflags |= MOUNT_KATTR_RECURSE; -		ret = copy_mount_setattr(uattr, usize, &kattr); -		if (ret) +		ret = wants_mount_setattr(uattr, usize, &kattr); +		if (ret < 0)  			return ret; -		ret = do_mount_setattr(&file->f_path, &kattr); -		if (ret) -			return ret; +		if (ret) { +			ret = do_mount_setattr(&file->f_path, &kattr); +			if (ret) +				return ret; -		finish_mount_kattr(&kattr); +			finish_mount_kattr(&kattr); +		}  	}  	fd = get_unused_fd_flags(flags & O_CLOEXEC); diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 9397ed39b0b4..8f70f8da064b 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -372,7 +372,7 @@ void fscache_withdraw_cache(struct fscache_cache *cache)  EXPORT_SYMBOL(fscache_withdraw_cache);  #ifdef CONFIG_PROC_FS -static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW"; +static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW";  /*   * Generate a list of caches in /proc/fs/fscache/caches diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index d4d4b3a8b106..3d56fc73435f 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ 
-29,7 +29,7 @@ static LIST_HEAD(fscache_cookie_lru);  static DEFINE_SPINLOCK(fscache_cookie_lru_lock);  DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out);  static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); -static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; +static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] __nonstring = "-LCAIFUWRD";  static unsigned int fscache_lru_cookie_timeout = 10 * HZ;  void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 4e3e62040831..70ecc8f5f210 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -127,11 +127,13 @@ static int __init netfs_init(void)  	if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)  		goto error_subreqpool; +#ifdef CONFIG_PROC_FS  	if (!proc_mkdir("fs/netfs", NULL))  		goto error_proc;  	if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,  			     &netfs_requests_seq_ops))  		goto error_procfile; +#endif  #ifdef CONFIG_FSCACHE_STATS  	if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,  				netfs_stats_show)) @@ -144,9 +146,11 @@ static int __init netfs_init(void)  	return 0;  error_fscache: +#ifdef CONFIG_PROC_FS  error_procfile:  	remove_proc_subtree("fs/netfs", NULL);  error_proc: +#endif  	mempool_exit(&netfs_subrequest_pool);  error_subreqpool:  	kmem_cache_destroy(netfs_subrequest_slab); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index d3f76101ad4b..07932ce9246c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -2,6 +2,7 @@  config NFS_FS  	tristate "NFS client support"  	depends on INET && FILE_LOCKING && MULTIUSER +	select CRC32  	select LOCKD  	select SUNRPC  	select NFS_COMMON @@ -196,7 +197,6 @@ config NFS_USE_KERNEL_DNS  config NFS_DEBUG  	bool  	depends on NFS_FS && SUNRPC_DEBUG -	select CRC32  	default y  config NFS_DISABLE_UDP_SUPPORT diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 
02c916a55020..6d63b958c4bb 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1105,6 +1105,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)  		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)  			server->namelen = NFS2_MAXNAMLEN;  	} +	/* Linux 'subtree_check' borkenness mandates this setting */ +	server->fh_expire_type = NFS_FH_VOL_RENAME;  	if (!(fattr->valid & NFS_ATTR_FATTR)) {  		error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, @@ -1200,6 +1202,10 @@ void nfs_clients_init(struct net *net)  #if IS_ENABLED(CONFIG_NFS_V4)  	idr_init(&nn->cb_ident_idr);  #endif +#if IS_ENABLED(CONFIG_NFS_V4_1) +	INIT_LIST_HEAD(&nn->nfs4_data_server_cache); +	spin_lock_init(&nn->nfs4_data_server_lock); +#endif  	spin_lock_init(&nn->nfs_client_lock);  	nn->boot_time = ktime_get_real();  	memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); @@ -1216,6 +1222,9 @@ void nfs_clients_exit(struct net *net)  	nfs_cleanup_cb_ident_idr(net);  	WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));  	WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); +#if IS_ENABLED(CONFIG_NFS_V4_1) +	WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache)); +#endif  }  #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bd23fc736b39..d0e0b435a843 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2676,6 +2676,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)  	unblock_revalidate(new_dentry);  } +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, +					   struct dentry *new_dentry) +{ +	struct nfs_server *server = NFS_SB(old_dentry->d_sb); + +	if (old_dentry->d_parent != new_dentry->d_parent) +		return false; +	if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) +		return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); +	return true; +} +  /*   * RENAME   * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2763,7 +2775,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,  	} -	if 
(S_ISREG(old_inode->i_mode)) +	if (S_ISREG(old_inode->i_mode) && +	    nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry))  		nfs_sync_inode(old_inode);  	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,  				must_unblock ? nfs_unblock_rename : NULL); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f32f8d7c9122..48d89716193a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -757,7 +757,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)  {  	struct nfs_direct_req *dreq = hdr->dreq;  	struct nfs_commit_info cinfo; -	struct nfs_page *req = nfs_list_entry(hdr->pages.next);  	struct inode *inode = dreq->inode;  	int flags = NFS_ODIRECT_DONE; @@ -786,6 +785,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)  	spin_unlock(&inode->i_lock);  	while (!list_empty(&hdr->pages)) { +		struct nfs_page *req;  		req = nfs_list_entry(hdr->pages.next);  		nfs_list_remove_request(req); diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4fa304fa5bc4..29d9234d5c08 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -76,6 +76,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,  	struct page *scratch;  	struct list_head dsaddrs;  	struct nfs4_pnfs_ds_addr *da; +	struct net *net = server->nfs_client->cl_net;  	/* set up xdr stream */  	scratch = alloc_page(gfp_flags); @@ -159,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,  		mp_count = be32_to_cpup(p); /* multipath count */  		for (j = 0; j < mp_count; j++) { -			da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, -						    &stream, gfp_flags); +			da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);  			if (da)  				list_add_tail(&da->da_node, &dsaddrs);  		} @@ -170,7 +170,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,  			goto out_err_free_deviceid;  		} -		
dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); +		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);  		if (!dsaddr->ds_list[i])  			goto out_err_drain_dsaddrs;  		trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 61ad269c825f..e6909cafab68 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1329,7 +1329,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,  					    hdr->args.offset, hdr->args.count,  					    &hdr->res.op_status, OP_READ,  					    task->tk_status); -		trace_ff_layout_read_error(hdr); +		trace_ff_layout_read_error(hdr, task->tk_status);  	}  	err = ff_layout_async_handle_error(task, hdr->args.context->state, @@ -1502,7 +1502,7 @@ static int ff_layout_write_done_cb(struct rpc_task *task,  					    hdr->args.offset, hdr->args.count,  					    &hdr->res.op_status, OP_WRITE,  					    task->tk_status); -		trace_ff_layout_write_error(hdr); +		trace_ff_layout_write_error(hdr, task->tk_status);  	}  	err = ff_layout_async_handle_error(task, hdr->args.context->state, @@ -1551,7 +1551,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,  					    data->args.offset, data->args.count,  					    &data->res.op_status, OP_COMMIT,  					    task->tk_status); -		trace_ff_layout_commit_error(data); +		trace_ff_layout_commit_error(data, task->tk_status);  	}  	err = ff_layout_async_handle_error(task, NULL, data->ds_clp, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e58bedfb1dcc..4a304cf17c4b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,  	struct nfs4_pnfs_ds_addr *da;  	struct nfs4_ff_layout_ds *new_ds = NULL;  	struct 
nfs4_ff_ds_version *ds_versions = NULL; +	struct net *net = server->nfs_client->cl_net;  	u32 mp_count;  	u32 version_count;  	__be32 *p; @@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,  	for (i = 0; i < mp_count; i++) {  		/* multipath ds */ -		da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, -					    &stream, gfp_flags); +		da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);  		if (da)  			list_add_tail(&da->da_node, &dsaddrs);  	} @@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,  	new_ds->ds_versions = ds_versions;  	new_ds->ds_versions_cnt = version_count; -	new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); +	new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);  	if (!new_ds->ds)  		goto out_err_drain_dsaddrs; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ec8d32d0e2e9..6655e5f32ec6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -899,18 +899,11 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)  	return ((u64)ts->tv_sec << 30) + ts->tv_nsec;  } -#ifdef CONFIG_CRC32  static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)  {  	return ~crc32_le(0xFFFFFFFF, &stateid->other[0],  				NFS4_STATEID_OTHER_SIZE);  } -#else -static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) -{ -	return 0; -} -#endif  static inline bool nfs_current_task_exiting(void)  { diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 5c21caeae075..4ec952f9f47d 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -278,6 +278,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,  		new = __nfs_local_open_fh(clp, cred, fh, nfl, mode);  		if (IS_ERR(new))  			return NULL; +		rcu_read_lock();  		/* try to swap in the pointer */  		spin_lock(&clp->cl_uuid.lock);  		nf = rcu_dereference_protected(*pnf, 1); @@ -287,7 +288,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,  			
rcu_assign_pointer(*pnf, nf);  		}  		spin_unlock(&clp->cl_uuid.lock); -		rcu_read_lock();  	}  	nf = nfs_local_file_get(nf);  	rcu_read_unlock(); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index a68b21603ea9..6ba3ea39e928 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -31,7 +31,11 @@ struct nfs_net {  	unsigned short nfs_callback_tcpport;  	unsigned short nfs_callback_tcpport6;  	int cb_users[NFS4_MAX_MINOR_VERSION + 1]; -#endif +#endif /* CONFIG_NFS_V4 */ +#if IS_ENABLED(CONFIG_NFS_V4_1) +	struct list_head nfs4_data_server_cache; +	spinlock_t nfs4_data_server_lock; +#endif /* CONFIG_NFS_V4_1 */  	struct nfs_netns_client *nfs_client;  	spinlock_t nfs_client_lock;  	ktime_t boot_time; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 18d8f6529f61..a126eb31f62f 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -104,7 +104,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu)  	switch (status) {  		case 0: -			status = nfs_refresh_inode(inode, res.fattr); +			nfs_refresh_inode(inode, res.fattr);  			break;  		case -EPFNOSUPPORT:  		case -EPROTONOSUPPORT: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 970f28dbf253..b1d2122bd5a7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -671,6 +671,15 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,  	struct nfs_client *clp = server->nfs_client;  	int ret; +	if ((task->tk_rpc_status == -ENETDOWN || +	     task->tk_rpc_status == -ENETUNREACH) && +	    task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { +		exception->delay = 0; +		exception->recovering = 0; +		exception->retry = 0; +		return -EIO; +	} +  	ret = nfs4_do_handle_exception(server, errorcode, exception);  	if (exception->delay) {  		int ret2 = nfs4_exception_should_retrans(server, exception); @@ -7074,10 +7083,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,  	struct nfs4_unlockdata *p;  	struct nfs4_state *state = lsp->ls_state;  	struct inode *inode = 
state->inode; +	struct nfs_lock_context *l_ctx;  	p = kzalloc(sizeof(*p), GFP_KERNEL);  	if (p == NULL)  		return NULL; +	l_ctx = nfs_get_lock_context(ctx); +	if (!IS_ERR(l_ctx)) { +		p->l_ctx = l_ctx; +	} else { +		kfree(p); +		return NULL; +	}  	p->arg.fh = NFS_FH(inode);  	p->arg.fl = &p->fl;  	p->arg.seqid = seqid; @@ -7085,7 +7102,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,  	p->lsp = lsp;  	/* Ensure we don't close file until we're done freeing locks! */  	p->ctx = get_nfs_open_context(ctx); -	p->l_ctx = nfs_get_lock_context(ctx);  	locks_init_lock(&p->fl);  	locks_copy_lock(&p->fl, fl);  	p->server = NFS_SERVER(inode); diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 351616c61df5..f9c291e2165c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,  	memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);  } -#ifdef CONFIG_CRC32  /*   * nfs_session_id_hash - calculate the crc32 hash for the session id   * @session - pointer to session   */  #define nfs_session_id_hash(sess_id) \  	(~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) -#else -#define nfs_session_id_hash(session) (0) -#endif  #else /* defined(CONFIG_NFS_V4_1) */  static inline int nfs4_init_session(struct nfs_client *clp) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index bc67fe6801b1..deab4c0e21a0 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2051,13 +2051,15 @@ TRACE_EVENT(fl_getdevinfo,  DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,  		TP_PROTO( -			const struct nfs_pgio_header *hdr +			const struct nfs_pgio_header *hdr, +			int error  		), -		TP_ARGS(hdr), +		TP_ARGS(hdr, error),  		TP_STRUCT__entry(  			__field(unsigned long, error) +			__field(unsigned long, nfs_error)  			__field(dev_t, dev)  			__field(u32, fhandle)  			__field(u64, fileid) @@ -2073,7 +2075,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,  		
TP_fast_assign(  			const struct inode *inode = hdr->inode; -			__entry->error = hdr->res.op_status; +			__entry->error = -error; +			__entry->nfs_error = hdr->res.op_status;  			__entry->fhandle = nfs_fhandle_hash(hdr->args.fh);  			__entry->fileid = NFS_FILEID(inode);  			__entry->dev = inode->i_sb->s_dev; @@ -2088,7 +2091,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,  		TP_printk(  			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " -			"offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s", +			"offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s " +			"nfs_error=%lu (%s)",  			-__entry->error,  			show_nfs4_status(__entry->error),  			MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2096,28 +2100,32 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,  			__entry->fhandle,  			__entry->offset, __entry->count,  			__entry->stateid_seq, __entry->stateid_hash, -			__get_str(dstaddr) +			__get_str(dstaddr), __entry->nfs_error, +			show_nfs4_status(__entry->nfs_error)  		)  );  #define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \  	DEFINE_EVENT(nfs4_flexfiles_io_event, name, \  			TP_PROTO( \ -				const struct nfs_pgio_header *hdr \ +				const struct nfs_pgio_header *hdr, \ +				int error \  			), \ -			TP_ARGS(hdr)) +			TP_ARGS(hdr, error))  DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error);  DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error);  TRACE_EVENT(ff_layout_commit_error,  		TP_PROTO( -			const struct nfs_commit_data *data +			const struct nfs_commit_data *data, +			int error  		), -		TP_ARGS(data), +		TP_ARGS(data, error),  		TP_STRUCT__entry(  			__field(unsigned long, error) +			__field(unsigned long, nfs_error)  			__field(dev_t, dev)  			__field(u32, fhandle)  			__field(u64, fileid) @@ -2131,7 +2139,8 @@ TRACE_EVENT(ff_layout_commit_error,  		TP_fast_assign(  			const struct inode *inode = data->inode; -			__entry->error = data->res.op_status; +			__entry->error = -error; +			__entry->nfs_error = data->res.op_status;  			__entry->fhandle = 
nfs_fhandle_hash(data->args.fh);  			__entry->fileid = NFS_FILEID(inode);  			__entry->dev = inode->i_sb->s_dev; @@ -2142,14 +2151,15 @@ TRACE_EVENT(ff_layout_commit_error,  		TP_printk(  			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " -			"offset=%llu count=%u dstaddr=%s", +			"offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)",  			-__entry->error,  			show_nfs4_status(__entry->error),  			MAJOR(__entry->dev), MINOR(__entry->dev),  			(unsigned long long)__entry->fileid,  			__entry->fhandle,  			__entry->offset, __entry->count, -			__get_str(dstaddr) +			__get_str(dstaddr), __entry->nfs_error, +			show_nfs4_status(__entry->nfs_error)  		)  ); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5f582713bf05..3adb7d0dbec7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -745,6 +745,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,  	return remaining;  } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ +	struct pnfs_layout_segment *lseg; + +	list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) +		pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} +  static void  pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,  		struct list_head *free_me, @@ -1246,21 +1254,15 @@ static void pnfs_clear_layoutcommit(struct inode *inode,  static void  pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo,  				     const nfs4_stateid *arg_stateid, -				     const struct pnfs_layout_range *range) +				     const struct pnfs_layout_range *range, +				     struct list_head *freeme)  { -	const struct pnfs_layout_segment *lseg; -	u32 seq = be32_to_cpu(arg_stateid->seqid); -  	if (pnfs_layout_is_valid(lo) && -	    nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) { -		list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) { -			if (pnfs_seqid_is_newer(lseg->pls_seq, seq) || -			    !pnfs_should_free_range(&lseg->pls_range, range)) -				continue; -			pnfs_set_plh_return_info(lo, range->iomode, seq); -			break; 
-		} -	} +	    nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) +		pnfs_reset_return_info(lo); +	else +		pnfs_mark_layout_stateid_invalid(lo, freeme); +	pnfs_clear_layoutreturn_waitbit(lo);  }  void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, @@ -1268,11 +1270,12 @@ void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo,  				   const struct pnfs_layout_range *range)  {  	struct inode *inode = lo->plh_inode; +	LIST_HEAD(freeme);  	spin_lock(&inode->i_lock); -	pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range); -	pnfs_clear_layoutreturn_waitbit(lo); +	pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme);  	spin_unlock(&inode->i_lock); +	pnfs_free_lseg_list(&freeme);  }  void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, @@ -1292,6 +1295,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,  		pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);  		pnfs_free_returned_lsegs(lo, &freeme, range, seq);  		pnfs_set_layout_stateid(lo, stateid, NULL, true); +		pnfs_reset_return_info(lo);  	} else  		pnfs_mark_layout_stateid_invalid(lo, &freeme);  out_unlock: @@ -1661,6 +1665,18 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,  		/* Was there an RPC level error? If not, retry */  		if (task->tk_rpc_status == 0)  			break; +		/* +		 * Is there a fatal network level error? +		 * If so release the layout, but flag the error. 
+		 */ +		if ((task->tk_rpc_status == -ENETDOWN || +		     task->tk_rpc_status == -ENETUNREACH) && +		    task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { +			*ret = 0; +			(*respp)->lrs_present = 0; +			retval = -EIO; +			break; +		}  		/* If the call was not sent, let caller handle it */  		if (!RPC_WAS_SENT(task))  			return 0; @@ -1695,6 +1711,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,  	struct inode *inode = args->inode;  	const nfs4_stateid *res_stateid = NULL;  	struct nfs4_xdr_opaque_data *ld_private = args->ld_private; +	LIST_HEAD(freeme);  	switch (ret) {  	case -NFS4ERR_BADSESSION: @@ -1703,9 +1720,9 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,  	case -NFS4ERR_NOMATCHING_LAYOUT:  		spin_lock(&inode->i_lock);  		pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, -						     &args->range); -		pnfs_clear_layoutreturn_waitbit(lo); +						     &args->range, &freeme);  		spin_unlock(&inode->i_lock); +		pnfs_free_lseg_list(&freeme);  		break;  	case 0:  		if (res->lrs_present) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 30d2613e912b..91ff877185c8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -60,6 +60,7 @@ struct nfs4_pnfs_ds {  	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */  	char			*ds_remotestr;	/* comma sep list of addrs */  	struct list_head	ds_addrs; +	const struct net	*ds_net;  	struct nfs_client	*ds_clp;  	refcount_t		ds_count;  	unsigned long		ds_state; @@ -415,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode,  int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);  void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);  void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, +				      struct list_head *dsaddrs,  				      gfp_t gfp_flags);  void nfs4_pnfs_v3_ds_connect_unload(void);  int 
nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index dbef837e871a..91ef486f40b9 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -16,6 +16,7 @@  #include "nfs4session.h"  #include "internal.h"  #include "pnfs.h" +#include "netns.h"  #define NFSDBG_FACILITY		NFSDBG_PNFS @@ -504,14 +505,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);  /*   * Data server cache   * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting + * Data servers can be mapped to different device ids, but should + * never be shared between net namespaces. + * + * nfs4_pnfs_ds reference counting:   *   - set to 1 on allocation   *   - incremented when a device id maps a data server already in the cache.   *   - decremented when deviceid is removed from the cache.   */ -static DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache);  /* Debug routines */  static void @@ -604,11 +605,11 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1,   * Lookup DS by addresses.  
nfs4_ds_cache_lock is held   */  static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs)  {  	struct nfs4_pnfs_ds *ds; -	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) +	list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node)  		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))  			return ds;  	return NULL; @@ -653,10 +654,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)  void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)  { -	if (refcount_dec_and_lock(&ds->ds_count, -				&nfs4_ds_cache_lock)) { +	struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id); + +	if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) {  		list_del_init(&ds->ds_node); -		spin_unlock(&nfs4_ds_cache_lock); +		spin_unlock(&nn->nfs4_data_server_lock);  		destroy_ds(ds);  	}  } @@ -716,8 +718,9 @@ out_err:   * uncached and return cached struct nfs4_pnfs_ds.   
*/  struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags)  { +	struct nfs_net *nn = net_generic(net, nfs_net_id);  	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;  	char *remotestr; @@ -733,16 +736,17 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)  	/* this is only used for debugging, so it's ok if its NULL */  	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); -	spin_lock(&nfs4_ds_cache_lock); -	tmp_ds = _data_server_lookup_locked(dsaddrs); +	spin_lock(&nn->nfs4_data_server_lock); +	tmp_ds = _data_server_lookup_locked(nn, dsaddrs);  	if (tmp_ds == NULL) {  		INIT_LIST_HEAD(&ds->ds_addrs);  		list_splice_init(dsaddrs, &ds->ds_addrs);  		ds->ds_remotestr = remotestr;  		refcount_set(&ds->ds_count, 1);  		INIT_LIST_HEAD(&ds->ds_node); +		ds->ds_net = net;  		ds->ds_clp = NULL; -		list_add(&ds->ds_node, &nfs4_data_server_cache); +		list_add(&ds->ds_node, &nn->nfs4_data_server_cache);  		dprintk("%s add new data server %s\n", __func__,  			ds->ds_remotestr);  	} else { @@ -754,7 +758,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)  			refcount_read(&tmp_ds->ds_count));  		ds = tmp_ds;  	} -	spin_unlock(&nfs4_ds_cache_lock); +	spin_unlock(&nn->nfs4_data_server_lock);  out:  	return ds;  } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 792d3fed1b45..731a88f6313e 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -4,6 +4,7 @@ config NFSD  	depends on INET  	depends on FILE_LOCKING  	depends on FSNOTIFY +	select CRC32  	select LOCKD  	select SUNRPC  	select EXPORTFS diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2041268b398a..59a693f22452 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5430,7 +5430,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)  	queued = nfsd4_run_cb(&dp->dl_recall);  	WARN_ON_ONCE(!queued);  	if (!queued) -		nfs4_put_stid(&dp->dl_stid); +		
refcount_dec(&dp->dl_stid.sc_count);  }  /* Called from break_lease() with flc_lock held. */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 876152a91f12..5103c2f4d225 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -267,7 +267,6 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1,  	return true;  } -#ifdef CONFIG_CRC32  /**   * knfsd_fh_hash - calculate the crc32 hash for the filehandle   * @fh - pointer to filehandle @@ -279,12 +278,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)  {  	return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);  } -#else -static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) -{ -	return 0; -} -#endif  /**   * fh_clear_pre_post_attrs - Reset pre/post attributes diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index cb01ea81724d..d0bcf744c553 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -705,8 +705,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb)  	int blocksize;  	int err; -	down_write(&nilfs->ns_sem); -  	blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);  	if (!blocksize) {  		nilfs_err(sb, "unable to set blocksize"); @@ -779,7 +777,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb)  	set_nilfs_init(nilfs);  	err = 0;   out: -	up_write(&nilfs->ns_sem);  	return err;   failed_sbh: diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f2d840ae4ded..87f861e9004f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1961,12 +1961,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,  		return -EINVAL;  	if (mark_cmd == FAN_MARK_FLUSH) { -		if (mark_type == FAN_MARK_MOUNT) -			fsnotify_clear_vfsmount_marks_by_group(group); -		else if (mark_type == FAN_MARK_FILESYSTEM) -			fsnotify_clear_sb_marks_by_group(group); -		else -			fsnotify_clear_inode_marks_by_group(group); +		fsnotify_clear_marks_by_group(group, obj_type);  		
return 0;  	} diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b8ac85b548c7..821cb7874685 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6918,6 +6918,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end,  		if (IS_ERR(folios[numfolios])) {  			ret = PTR_ERR(folios[numfolios]);  			mlog_errno(ret); +			folios[numfolios] = NULL;  			goto out;  		} diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f1b4b3e611cb..e5f58ff2175f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -174,7 +174,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)  	struct ocfs2_recovery_map *rm;  	mutex_init(&osb->recovery_lock); -	osb->disable_recovery = 0; +	osb->recovery_state = OCFS2_REC_ENABLED;  	osb->recovery_thread_task = NULL;  	init_waitqueue_head(&osb->recovery_event); @@ -190,31 +190,53 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)  	return 0;  } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */  static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)  { -	mb();  	return osb->recovery_thread_task != NULL;  } -void ocfs2_recovery_exit(struct ocfs2_super *osb) +static void ocfs2_recovery_disable(struct ocfs2_super *osb, +				   enum ocfs2_recovery_state state)  { -	struct ocfs2_recovery_map *rm; - -	/* disable any new recovery threads and wait for any currently -	 * running ones to exit. Do this before setting the vol_state. */  	mutex_lock(&osb->recovery_lock); -	osb->disable_recovery = 1; +	/* +	 * If recovery thread is not running, we can directly transition to +	 * final state. 
+	 */ +	if (!ocfs2_recovery_thread_running(osb)) { +		osb->recovery_state = state + 1; +		goto out_lock; +	} +	osb->recovery_state = state; +	/* Wait for recovery thread to acknowledge state transition */ +	wait_event_cmd(osb->recovery_event, +		       !ocfs2_recovery_thread_running(osb) || +				osb->recovery_state >= state + 1, +		       mutex_unlock(&osb->recovery_lock), +		       mutex_lock(&osb->recovery_lock)); +out_lock:  	mutex_unlock(&osb->recovery_lock); -	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); -	/* At this point, we know that no more recovery threads can be -	 * launched, so wait for any recovery completion work to -	 * complete. */ +	/* +	 * At this point we know that no more recovery work can be queued so +	 * wait for any recovery completion work to complete. +	 */  	if (osb->ocfs2_wq)  		flush_workqueue(osb->ocfs2_wq); +} + +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb) +{ +	ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE); +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ +	struct ocfs2_recovery_map *rm; + +	/* disable any new recovery threads and wait for any currently +	 * running ones to exit. Do this before setting the vol_state. */ +	ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE);  	/*  	 * Now that recovery is shut down, and the osb is about to be @@ -1249,7 +1271,7 @@ static int ocfs2_force_read_journal(struct inode *inode)  		}  		for (i = 0; i < p_blocks; i++, p_blkno++) { -			bh = __find_get_block(osb->sb->s_bdev, p_blkno, +			bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno,  					osb->sb->s_blocksize);  			/* block not cached. 
*/  			if (!bh) @@ -1472,6 +1494,18 @@ static int __ocfs2_recovery_thread(void *arg)  		}  	}  restart: +	if (quota_enabled) { +		mutex_lock(&osb->recovery_lock); +		/* Confirm that recovery thread will no longer recover quotas */ +		if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) { +			osb->recovery_state = OCFS2_REC_QUOTA_DISABLED; +			wake_up(&osb->recovery_event); +		} +		if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED) +			quota_enabled = 0; +		mutex_unlock(&osb->recovery_lock); +	} +  	status = ocfs2_super_lock(osb, 1);  	if (status < 0) {  		mlog_errno(status); @@ -1569,27 +1603,29 @@ bail:  	ocfs2_free_replay_slots(osb);  	osb->recovery_thread_task = NULL; -	mb(); /* sync with ocfs2_recovery_thread_running */ +	if (osb->recovery_state == OCFS2_REC_WANT_DISABLE) +		osb->recovery_state = OCFS2_REC_DISABLED;  	wake_up(&osb->recovery_event);  	mutex_unlock(&osb->recovery_lock); -	if (quota_enabled) -		kfree(rm_quota); +	kfree(rm_quota);  	return status;  }  void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)  { +	int was_set = -1; +  	mutex_lock(&osb->recovery_lock); +	if (osb->recovery_state < OCFS2_REC_WANT_DISABLE) +		was_set = ocfs2_recovery_map_set(osb, node_num);  	trace_ocfs2_recovery_thread(node_num, osb->node_num, -		osb->disable_recovery, osb->recovery_thread_task, -		osb->disable_recovery ? 
-		-1 : ocfs2_recovery_map_set(osb, node_num)); +		osb->recovery_state, osb->recovery_thread_task, was_set); -	if (osb->disable_recovery) +	if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE)  		goto out;  	if (osb->recovery_thread_task) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index e3c3a35dc5e0..6397170f302f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);  int ocfs2_recovery_init(struct ocfs2_super *osb);  void ocfs2_recovery_exit(struct ocfs2_super *osb); +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb);  int ocfs2_compute_replay_slots(struct ocfs2_super *osb);  void ocfs2_free_replay_slots(struct ocfs2_super *osb); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 51c52768132d..6aaa94c554c1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -308,6 +308,21 @@ enum ocfs2_journal_trigger_type {  void ocfs2_initialize_journal_triggers(struct super_block *sb,  				       struct ocfs2_triggers triggers[]); +enum ocfs2_recovery_state { +	OCFS2_REC_ENABLED = 0, +	OCFS2_REC_QUOTA_WANT_DISABLE, +	/* +	 * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for +	 * ocfs2_recovery_disable_quota() to work. 
+	 */ +	OCFS2_REC_QUOTA_DISABLED, +	OCFS2_REC_WANT_DISABLE, +	/* +	 * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work +	 */ +	OCFS2_REC_DISABLED, +}; +  struct ocfs2_journal;  struct ocfs2_slot_info;  struct ocfs2_recovery_map; @@ -370,7 +385,7 @@ struct ocfs2_super  	struct ocfs2_recovery_map *recovery_map;  	struct ocfs2_replay_map *replay_map;  	struct task_struct *recovery_thread_task; -	int disable_recovery; +	enum ocfs2_recovery_state recovery_state;  	wait_queue_head_t checkpoint_event;  	struct ocfs2_journal *journal;  	unsigned long osb_commit_interval; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2956d888c131..e272429da3db 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -453,8 +453,7 @@ out:  /* Sync changes in local quota file into global quota file and   * reinitialize local quota file. - * The function expects local quota file to be already locked and - * s_umount locked in shared mode. */ + * The function expects local quota file to be already locked. 
*/  static int ocfs2_recover_local_quota_file(struct inode *lqinode,  					  int type,  					  struct ocfs2_quota_recovery *rec) @@ -588,7 +587,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,  {  	unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,  					      LOCAL_GROUP_QUOTA_SYSTEM_INODE }; -	struct super_block *sb = osb->sb;  	struct ocfs2_local_disk_dqinfo *ldinfo;  	struct buffer_head *bh;  	handle_t *handle; @@ -600,7 +598,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,  	printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "  	       "slot %u\n", osb->dev_str, slot_num); -	down_read(&sb->s_umount);  	for (type = 0; type < OCFS2_MAXQUOTAS; type++) {  		if (list_empty(&(rec->r_list[type])))  			continue; @@ -677,7 +674,6 @@ out_put:  			break;  	}  out: -	up_read(&sb->s_umount);  	kfree(rec);  	return status;  } @@ -843,8 +839,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)  	ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);  	/* -	 * s_umount held in exclusive mode protects us against racing with -	 * recovery thread... +	 * ocfs2_dismount_volume() has already aborted quota recovery...  	 
*/  	if (oinfo->dqi_rec) {  		ocfs2_free_quota_recovery(oinfo->dqi_rec); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f7b483f0de2a..6ac4dcd54588 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -698,10 +698,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,  	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,  					       ac, cl); -	if (PTR_ERR(bg_bh) == -ENOSPC) +	if (PTR_ERR(bg_bh) == -ENOSPC) { +		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;  		bg_bh = ocfs2_block_group_alloc_discontig(handle,  							  alloc_inode,  							  ac, cl); +	}  	if (IS_ERR(bg_bh)) {  		status = PTR_ERR(bg_bh);  		bg_bh = NULL; @@ -1794,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,  {  	int status;  	u16 chain; +	u32 contig_bits;  	u64 next_group;  	struct inode *alloc_inode = ac->ac_inode;  	struct buffer_head *group_bh = NULL; @@ -1819,10 +1822,21 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,  	status = -ENOSPC;  	/* for now, the chain search is a bit simplistic. We just use  	 * the 1st group with any empty bits. 
*/ -	while ((status = ac->ac_group_search(alloc_inode, group_bh, -					     bits_wanted, min_bits, -					     ac->ac_max_block, -					     res)) == -ENOSPC) { +	while (1) { +		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) { +			contig_bits = le16_to_cpu(bg->bg_contig_free_bits); +			if (!contig_bits) +				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, +						le16_to_cpu(bg->bg_bits), 0); +			if (bits_wanted > contig_bits && contig_bits >= min_bits) +				bits_wanted = contig_bits; +		} + +		status = ac->ac_group_search(alloc_inode, group_bh, +				bits_wanted, min_bits, +				ac->ac_max_block, res); +		if (status != -ENOSPC) +			break;  		if (!bg->bg_next_group)  			break; @@ -1982,6 +1996,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,  	victim = ocfs2_find_victim_chain(cl);  	ac->ac_chain = victim; +search:  	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,  				    res, &bits_left);  	if (!status) { @@ -2022,6 +2037,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,  		}  	} +	/* Chains can't supply the bits_wanted contiguous space. +	 * We should switch to using every single bit when allocating +	 * from the global bitmap. 
*/ +	if (i == le16_to_cpu(cl->cl_next_free_rec) && +	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) { +		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; +		ac->ac_chain = victim; +		goto search; +	} +  set_hint:  	if (status != -ENOSPC) {  		/* If the next search of this group is not likely to @@ -2365,7 +2390,8 @@ int __ocfs2_claim_clusters(handle_t *handle,  	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);  	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL -	       && ac->ac_which != OCFS2_AC_USE_MAIN); +	       && ac->ac_which != OCFS2_AC_USE_MAIN +	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);  	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {  		WARN_ON(min_clusters > 1); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index b481b834857d..bcf2ed4a8631 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -29,6 +29,7 @@ struct ocfs2_alloc_context {  #define OCFS2_AC_USE_MAIN  2  #define OCFS2_AC_USE_INODE 3  #define OCFS2_AC_USE_META  4 +#define OCFS2_AC_USE_MAIN_DISCONTIG  5  	u32    ac_which;  	/* these are used by the chain search */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8bb5022f3082..3d2533950bae 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1812,6 +1812,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)  	/* Orphan scan should be stopped as early as possible */  	ocfs2_orphan_scan_stop(osb); +	/* Stop quota recovery so that we can disable quotas */ +	ocfs2_recovery_disable_quota(osb); +  	ocfs2_disable_quotas(osb);  	/* All dquots should be freed by now */ diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6f2f8f4cfbbc..aef942a758ce 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -541,8 +541,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,  bool ovl_is_metacopy_dentry(struct dentry *dentry);  char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);  int ovl_ensure_verity_loaded(struct path 
*path); -int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, -			 u8 *digest_buf, int *buf_length);  int ovl_validate_verity(struct ovl_fs *ofs,  			struct path *metapath,  			struct path *datapath); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b63474d1b064..e19940d649ca 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1138,6 +1138,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,  		return ERR_PTR(-EINVAL);  	} +	if (ctx->nr == ctx->nr_data) { +		pr_err("at least one non-data lowerdir is required\n"); +		return ERR_PTR(-EINVAL); +	} +  	err = -EINVAL;  	for (i = 0; i < ctx->nr; i++) {  		l = &ctx->lower[i]; diff --git a/fs/pnode.c b/fs/pnode.c index 7a062a5de10e..fb77427df39e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -150,7 +150,7 @@ static struct mount *propagation_next(struct mount *m,  					 struct mount *origin)  {  	/* are there any slaves of this mount? */ -	if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))  		return first_slave(m);  	while (1) { @@ -174,7 +174,7 @@ static struct mount *skip_propagation_subtree(struct mount *m,  	 * Advance m such that propagation_next will not return  	 * the slaves of m.  	 
*/ -	if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))  		m = last_slave(m);  	return m; @@ -185,7 +185,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)  	while (1) {  		while (1) {  			struct mount *next; -			if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) +			if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))  				return first_slave(m);  			next = next_peer(m);  			if (m->mnt_group_id == origin->mnt_group_id) { @@ -226,11 +226,15 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)  	struct mount *child;  	int type;  	/* skip ones added by this propagate_mnt() */ -	if (IS_MNT_PROPAGATED(m)) +	if (IS_MNT_NEW(m))  		return 0; -	/* skip if mountpoint isn't covered by it */ +	/* skip if mountpoint isn't visible in m */  	if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))  		return 0; +	/* skip if m is in the anon_ns we are emptying */ +	if (m->mnt_ns->mntns_flags & MNTNS_PROPAGATING) +		return 0; +  	if (peers(m, last_dest)) {  		type = CL_MAKE_SHARED;  	} else { @@ -380,9 +384,6 @@ bool propagation_would_overmount(const struct mount *from,  	if (!IS_MNT_SHARED(from))  		return false; -	if (IS_MNT_PROPAGATED(to)) -		return false; -  	if (to->mnt.mnt_root != mp->m_dentry)  		return false; diff --git a/fs/pnode.h b/fs/pnode.h index ddafe0d087ca..34b6247af01d 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -12,7 +12,7 @@  #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)  #define IS_MNT_SLAVE(m) ((m)->mnt_master) -#define IS_MNT_PROPAGATED(m) (!(m)->mnt_ns || ((m)->mnt_ns->mntns_flags & MNTNS_PROPAGATING)) +#define IS_MNT_NEW(m) (!(m)->mnt_ns)  #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)  #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)  #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index fe738623cf1b..240d82c6f908 
100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -29,7 +29,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,  {  	struct cached_fid *cfid; -	spin_lock(&cfids->cfid_list_lock);  	list_for_each_entry(cfid, &cfids->entries, entry) {  		if (!strcmp(cfid->path, path)) {  			/* @@ -38,25 +37,20 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,  			 * being deleted due to a lease break.  			 */  			if (!cfid->time || !cfid->has_lease) { -				spin_unlock(&cfids->cfid_list_lock);  				return NULL;  			}  			kref_get(&cfid->refcount); -			spin_unlock(&cfids->cfid_list_lock);  			return cfid;  		}  	}  	if (lookup_only) { -		spin_unlock(&cfids->cfid_list_lock);  		return NULL;  	}  	if (cfids->num_entries >= max_cached_dirs) { -		spin_unlock(&cfids->cfid_list_lock);  		return NULL;  	}  	cfid = init_cached_dir(path);  	if (cfid == NULL) { -		spin_unlock(&cfids->cfid_list_lock);  		return NULL;  	}  	cfid->cfids = cfids; @@ -74,7 +68,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,  	 */  	cfid->has_lease = true; -	spin_unlock(&cfids->cfid_list_lock);  	return cfid;  } @@ -187,8 +180,10 @@ replay_again:  	if (!utf16_path)  		return -ENOMEM; +	spin_lock(&cfids->cfid_list_lock);  	cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs);  	if (cfid == NULL) { +		spin_unlock(&cfids->cfid_list_lock);  		kfree(utf16_path);  		return -ENOENT;  	} @@ -197,7 +192,6 @@ replay_again:  	 * Otherwise, it is either a new entry or laundromat worker removed it  	 * from @cfids->entries.  Caller will put last reference if the latter.  	 
*/ -	spin_lock(&cfids->cfid_list_lock);  	if (cfid->has_lease && cfid->time) {  		spin_unlock(&cfids->cfid_list_lock);  		*ret_cfid = cfid; diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index e69968e88fe7..35892df7335c 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -704,18 +704,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)  	cifs_free_hash(&server->secmech.md5);  	cifs_free_hash(&server->secmech.sha512); -	if (!SERVER_IS_CHAN(server)) { -		if (server->secmech.enc) { -			crypto_free_aead(server->secmech.enc); -			server->secmech.enc = NULL; -		} - -		if (server->secmech.dec) { -			crypto_free_aead(server->secmech.dec); -			server->secmech.dec = NULL; -		} -	} else { +	if (server->secmech.enc) { +		crypto_free_aead(server->secmech.enc);  		server->secmech.enc = NULL; +	} +	if (server->secmech.dec) { +		crypto_free_aead(server->secmech.dec);  		server->secmech.dec = NULL;  	}  } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 07c4688ec4c9..3b32116b0b49 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -625,10 +625,8 @@ struct smb_version_operations {  	bool (*is_status_io_timeout)(char *buf);  	/* Check for STATUS_NETWORK_NAME_DELETED */  	bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); -	int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, -				   const char *full_path, -				   struct kvec *rsp_iov, -				   struct cifs_open_info_data *data); +	struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov, +								 u32 *plen);  	int (*create_reparse_symlink)(const unsigned int xid,  				      struct inode *inode,  				      struct dentry *dentry, diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 48d0d6f439cf..1b79fe07476f 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1266,10 +1266,9 @@ typedef struct smb_com_query_information_rsp {  typedef struct 
smb_com_setattr_req {  	struct smb_hdr hdr; /* wct = 8 */  	__le16 attr; -	__le16 time_low; -	__le16 time_high; +	__le32 last_write_time;  	__le16 reserved[5]; /* must be zero */ -	__u16  ByteCount; +	__le16 ByteCount;  	__u8   BufferFormat; /* 4 = ASCII */  	unsigned char fileName[];  } __attribute__((packed)) SETATTR_REQ; @@ -2256,6 +2255,8 @@ typedef struct {  #define FILE_SUPPORTS_ENCRYPTION	0x00020000  #define FILE_SUPPORTS_OBJECT_IDS	0x00010000  #define FILE_VOLUME_IS_COMPRESSED	0x00008000 +#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400 +#define FILE_RETURNS_CLEANUP_RESULT_INFO  0x00000200  #define FILE_SUPPORTS_REMOTE_STORAGE	0x00000100  #define FILE_SUPPORTS_REPARSE_POINTS	0x00000080  #define FILE_SUPPORTS_SPARSE_FILES	0x00000040 diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index cfcc07905bdf..ecf774a8f1ca 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -163,6 +163,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,  extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);  extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,  				  struct cifsFileInfo **ret_file); +extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, +				  struct file *file);  extern unsigned int smbCalcSize(void *buf);  extern int decode_negTokenInit(unsigned char *security_blob, int length,  			struct TCP_Server_Info *server); @@ -393,6 +395,10 @@ extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon);  extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,  			struct kstatfs *FSData); +extern int SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, +			     const char *fileName, __le32 attributes, __le64 write_time, +			     const struct nls_table *nls_codepage, +			     struct cifs_sb_info *cifs_sb);  extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon 
*tcon,  			const char *fileName, const FILE_BASIC_INFO *data,  			const struct nls_table *nls_codepage, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 60cb264a01e5..f55457b4b82e 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -5171,6 +5171,63 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,  	return rc;  } +int +SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, +		  const char *fileName, __le32 attributes, __le64 write_time, +		  const struct nls_table *nls_codepage, +		  struct cifs_sb_info *cifs_sb) +{ +	SETATTR_REQ *pSMB; +	SETATTR_RSP *pSMBr; +	struct timespec64 ts; +	int bytes_returned; +	int name_len; +	int rc; + +	cifs_dbg(FYI, "In %s path %s\n", __func__, fileName); + +retry: +	rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, +		      (void **) &pSMBr); +	if (rc) +		return rc; + +	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { +		name_len = +			cifsConvertToUTF16((__le16 *) pSMB->fileName, +					   fileName, PATH_MAX, nls_codepage, +					   cifs_remap(cifs_sb)); +		name_len++;     /* trailing null */ +		name_len *= 2; +	} else { +		name_len = copy_path_name(pSMB->fileName, fileName); +	} +	/* Only few attributes can be set by this command, others are not accepted by Win9x. */ +	pSMB->attr = cpu_to_le16(le32_to_cpu(attributes) & +			(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | ATTR_ARCHIVE)); +	/* Zero write time value (in both NT and SETATTR formats) means to not change it. 
*/ +	if (le64_to_cpu(write_time) != 0) { +		ts = cifs_NTtimeToUnix(write_time); +		pSMB->last_write_time = cpu_to_le32(ts.tv_sec); +	} +	pSMB->BufferFormat = 0x04; +	name_len++; /* account for buffer type byte */ +	inc_rfc1001_len(pSMB, (__u16)name_len); +	pSMB->ByteCount = cpu_to_le16(name_len); + +	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, +			 (struct smb_hdr *) pSMBr, &bytes_returned, 0); +	if (rc) +		cifs_dbg(FYI, "Send error in %s = %d\n", __func__, rc); + +	cifs_buf_release(pSMB); + +	if (rc == -EAGAIN) +		goto retry; + +	return rc; +} +  /* Some legacy servers such as NT4 require that the file times be set on     an open handle, rather than by pathname - this is awkward due to     potential access conflicts on the open, but it is unavoidable for these diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index f298e86a3c1f..6bf04d9a5491 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -300,7 +300,6 @@ cifs_abort_connection(struct TCP_Server_Info *server)  			 server->ssocket->flags);  		sock_release(server->ssocket);  		server->ssocket = NULL; -		put_net(cifs_net_ns(server));  	}  	server->sequence_number = 0;  	server->session_estab = false; @@ -1074,13 +1073,9 @@ clean_demultiplex_info(struct TCP_Server_Info *server)  	msleep(125);  	if (cifs_rdma_enabled(server))  		smbd_destroy(server); -  	if (server->ssocket) {  		sock_release(server->ssocket);  		server->ssocket = NULL; - -		/* Release netns reference for the socket. */ -		put_net(cifs_net_ns(server));  	}  	if (!list_empty(&server->pending_mid_q)) { @@ -1128,7 +1123,6 @@ clean_demultiplex_info(struct TCP_Server_Info *server)  		 */  	} -	/* Release netns reference for this server. */  	put_net(cifs_net_ns(server));  	kfree(server->leaf_fullpath);  	kfree(server->hostname); @@ -1774,8 +1768,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,  	tcp_ses->ops = ctx->ops;  	tcp_ses->vals = ctx->vals; - -	/* Grab netns reference for this server. 
*/  	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));  	tcp_ses->sign = ctx->sign; @@ -1903,7 +1895,6 @@ smbd_connected:  out_err_crypto_release:  	cifs_crypto_secmech_release(tcp_ses); -	/* Release netns reference for this server. */  	put_net(cifs_net_ns(tcp_ses));  out_err: @@ -1912,10 +1903,8 @@ out_err:  			cifs_put_tcp_session(tcp_ses->primary_server, false);  		kfree(tcp_ses->hostname);  		kfree(tcp_ses->leaf_fullpath); -		if (tcp_ses->ssocket) { +		if (tcp_ses->ssocket)  			sock_release(tcp_ses->ssocket); -			put_net(cifs_net_ns(tcp_ses)); -		}  		kfree(tcp_ses);  	}  	return ERR_PTR(rc); @@ -2556,6 +2545,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)  		return 0;  	if (tcon->nodelete != ctx->nodelete)  		return 0; +	if (tcon->posix_extensions != ctx->linux_ext) +		return 0;  	return 1;  } @@ -3357,24 +3348,20 @@ generic_ip_connect(struct TCP_Server_Info *server)  		socket = server->ssocket;  	} else {  		struct net *net = cifs_net_ns(server); +		struct sock *sk; -		rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); +		rc = __sock_create(net, sfamily, SOCK_STREAM, +				   IPPROTO_TCP, &server->ssocket, 1);  		if (rc < 0) {  			cifs_server_dbg(VFS, "Error %d creating socket\n", rc);  			return rc;  		} -		/* -		 * Grab netns reference for the socket. -		 * -		 * This reference will be released in several situations: -		 * - In the failure path before the cifsd thread is started. -		 * - In the all place where server->socket is released, it is -		 *   also set to NULL. -		 * - Ultimately in clean_demultiplex_info(), during the final -		 *   teardown. -		 */ -		get_net(net); +		sk = server->ssocket->sk; +		__netns_tracker_free(net, &sk->ns_tracker, false); +		sk->sk_net_refcnt = 1; +		get_net_track(net, &sk->ns_tracker, GFP_KERNEL); +		sock_inuse_add(net, 1);  		/* BB other socket options to set KEEPALIVE, NODELAY? 
*/  		cifs_dbg(FYI, "Socket created\n"); @@ -3426,7 +3413,6 @@ generic_ip_connect(struct TCP_Server_Info *server)  	if (rc < 0) {  		cifs_dbg(FYI, "Error %d connecting to server\n", rc);  		trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); -		put_net(cifs_net_ns(server));  		sock_release(socket);  		server->ssocket = NULL;  		return rc; @@ -3767,28 +3753,7 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)  		}  	} -	/* -	 * Clamp the rsize/wsize mount arguments if they are too big for the server -	 * and set the rsize/wsize to the negotiated values if not passed in by -	 * the user on mount -	 */ -	if ((cifs_sb->ctx->wsize == 0) || -	    (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) { -		cifs_sb->ctx->wsize = -			round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE); -		/* -		 * in the very unlikely event that the server sent a max write size under PAGE_SIZE, -		 * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096 -		 */ -		if (cifs_sb->ctx->wsize == 0) { -			cifs_sb->ctx->wsize = PAGE_SIZE; -			cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n"); -		} -	} -	if ((cifs_sb->ctx->rsize == 0) || -	    (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx))) -		cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx); - +	cifs_negotiate_iosize(server, cifs_sb->ctx, tcon);  	/*  	 * The cookie is initialized from volume info returned above.  	 
* Inside cifs_fscache_get_super_cookie it checks diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 8407fb108664..950aa4f912f5 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -160,10 +160,10 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)  	server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);  	rdata->server = server; -	if (cifs_sb->ctx->rsize == 0) -		cifs_sb->ctx->rsize = -			server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink), -						     cifs_sb->ctx); +	if (cifs_sb->ctx->rsize == 0) { +		cifs_negotiate_rsize(server, cifs_sb->ctx, +				     tlink_tcon(req->cfile->tlink)); +	}  	rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,  					   &size, &rdata->credits); @@ -1007,6 +1007,11 @@ int cifs_open(struct inode *inode, struct file *file)  		} else {  			_cifsFileInfo_put(cfile, true, false);  		} +	} else { +		/* hard link on the defeered close file */ +		rc = cifs_get_hardlink_path(tcon, inode, file); +		if (rc) +			cifs_close_deferred_file(CIFS_I(inode));  	}  	if (server->oplocks) @@ -2071,6 +2076,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest)  		list_move(li, dest);  } +int +cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, +				struct file *file) +{ +	struct cifsFileInfo *open_file = NULL; +	struct cifsInodeInfo *cinode = CIFS_I(inode); +	int rc = 0; + +	spin_lock(&tcon->open_file_lock); +	spin_lock(&cinode->open_file_lock); + +	list_for_each_entry(open_file, &cinode->openFileList, flist) { +		if (file->f_flags == open_file->f_flags) { +			rc = -EINVAL; +			break; +		} +	} + +	spin_unlock(&cinode->open_file_lock); +	spin_unlock(&tcon->open_file_lock); +	return rc; +} +  void  cifs_free_llist(struct list_head *llist)  { diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 2980941b9667..a634a34d4086 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1021,6 +1021,7 @@ static int 
smb3_reconfigure(struct fs_context *fc)  	struct dentry *root = fc->root;  	struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);  	struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; +	unsigned int rsize = ctx->rsize, wsize = ctx->wsize;  	char *new_password = NULL, *new_password2 = NULL;  	bool need_recon = false;  	int rc; @@ -1103,11 +1104,8 @@ static int smb3_reconfigure(struct fs_context *fc)  	STEAL_STRING(cifs_sb, ctx, iocharset);  	/* if rsize or wsize not passed in on remount, use previous values */ -	if (ctx->rsize == 0) -		ctx->rsize = cifs_sb->ctx->rsize; -	if (ctx->wsize == 0) -		ctx->wsize = cifs_sb->ctx->wsize; - +	ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; +	ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize;  	smb3_cleanup_fs_context_contents(cifs_sb->ctx);  	rc = smb3_fs_context_dup(cifs_sb->ctx, ctx); @@ -1312,7 +1310,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,  				__func__);  			goto cifs_parse_mount_err;  		} -		ctx->bsize = result.uint_32; +		ctx->bsize = CIFS_ALIGN_BSIZE(fc, result.uint_32);  		ctx->got_bsize = true;  		break;  	case Opt_rasize: @@ -1336,24 +1334,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,  		ctx->rasize = result.uint_32;  		break;  	case Opt_rsize: -		ctx->rsize = result.uint_32; +		ctx->rsize = CIFS_ALIGN_RSIZE(fc, result.uint_32);  		ctx->got_rsize = true;  		ctx->vol_rsize = ctx->rsize;  		break;  	case Opt_wsize: -		ctx->wsize = result.uint_32; +		ctx->wsize = CIFS_ALIGN_WSIZE(fc, result.uint_32);  		ctx->got_wsize = true; -		if (ctx->wsize % PAGE_SIZE != 0) { -			ctx->wsize = round_down(ctx->wsize, PAGE_SIZE); -			if (ctx->wsize == 0) { -				ctx->wsize = PAGE_SIZE; -				cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE); -			} else { -				cifs_dbg(VFS, -					 "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n", -					 ctx->wsize, PAGE_SIZE); -			} -		}  		ctx->vol_wsize = ctx->wsize;  		break;  	
case Opt_acregmax: diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index d1d29249bcdb..9e83302ce4b8 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -20,6 +20,21 @@  		cifs_dbg(VFS, fmt, ## __VA_ARGS__);	\  	} while (0) +static inline size_t cifs_io_align(struct fs_context *fc, +				   const char *name, size_t size) +{ +	if (!size || !IS_ALIGNED(size, PAGE_SIZE)) { +		cifs_errorf(fc, "unaligned %s, making it a multiple of %lu bytes\n", +			    name, PAGE_SIZE); +		size = umax(round_down(size, PAGE_SIZE), PAGE_SIZE); +	} +	return size; +} + +#define CIFS_ALIGN_WSIZE(_fc, _size) cifs_io_align(_fc, "wsize", _size) +#define CIFS_ALIGN_RSIZE(_fc, _size) cifs_io_align(_fc, "rsize", _size) +#define CIFS_ALIGN_BSIZE(_fc, _size) cifs_io_align(_fc, "bsize", _size) +  enum smb_version {  	Smb_1 = 1,  	Smb_20, @@ -361,4 +376,36 @@ static inline void cifs_mount_unlock(void)  	mutex_unlock(&cifs_mount_mutex);  } +static inline void cifs_negotiate_rsize(struct TCP_Server_Info *server, +					struct smb3_fs_context *ctx, +					struct cifs_tcon *tcon) +{ +	unsigned int size; + +	size = umax(server->ops->negotiate_rsize(tcon, ctx), PAGE_SIZE); +	if (ctx->rsize) +		size = umax(umin(ctx->rsize, size), PAGE_SIZE); +	ctx->rsize = round_down(size, PAGE_SIZE); +} + +static inline void cifs_negotiate_wsize(struct TCP_Server_Info *server, +					struct smb3_fs_context *ctx, +					struct cifs_tcon *tcon) +{ +	unsigned int size; + +	size = umax(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE); +	if (ctx->wsize) +		size = umax(umin(ctx->wsize, size), PAGE_SIZE); +	ctx->wsize = round_down(size, PAGE_SIZE); +} + +static inline void cifs_negotiate_iosize(struct TCP_Server_Info *server, +					 struct smb3_fs_context *ctx, +					 struct cifs_tcon *tcon) +{ +	cifs_negotiate_rsize(server, ctx, tcon); +	cifs_negotiate_wsize(server, ctx, tcon); +} +  #endif diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index a00a9d91d0da..75be4b46bc6f 100644 
--- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1203,18 +1203,17 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,  			goto out;  		}  		break; -	case IO_REPARSE_TAG_MOUNT_POINT: -		cifs_create_junction_fattr(fattr, sb); -		rc = 0; -		goto out;  	default:  		/* Check for cached reparse point data */  		if (data->symlink_target || data->reparse.buf) {  			rc = 0; -		} else if (iov && server->ops->parse_reparse_point) { -			rc = server->ops->parse_reparse_point(cifs_sb, -							      full_path, -							      iov, data); +		} else if (iov && server->ops->get_reparse_point_buffer) { +			struct reparse_data_buffer *reparse_buf; +			u32 reparse_len; + +			reparse_buf = server->ops->get_reparse_point_buffer(iov, &reparse_len); +			rc = parse_reparse_point(reparse_buf, reparse_len, +						 cifs_sb, full_path, data);  			/*  			 * If the reparse point was not handled but it is the  			 * name surrogate which points to directory, then treat @@ -1228,6 +1227,16 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,  				cifs_create_junction_fattr(fattr, sb);  				goto out;  			} +			/* +			 * If the reparse point is unsupported by the Linux SMB +			 * client then let it process by the SMB server. So mask +			 * the -EOPNOTSUPP error code. This will allow Linux SMB +			 * client to send SMB OPEN request to server. If server +			 * does not support this reparse point too then server +			 * will return error during open the path. 
+			 */ +			if (rc == -EOPNOTSUPP) +				rc = 0;  		}  		if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 2b9e9885dc42..bb25e77c5540 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -542,12 +542,12 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer **buf,  			kfree(symname_utf16);  			return -ENOMEM;  		} -		/* Flag 0x02000000 is unknown, but all wsl symlinks have this value */ -		symlink_buf->Flags = cpu_to_le32(0x02000000); -		/* PathBuffer is in UTF-8 but without trailing null-term byte */ +		/* Version field must be set to 2 (MS-FSCC 2.1.2.7) */ +		symlink_buf->Version = cpu_to_le32(2); +		/* Target for Version 2 is in UTF-8 but without trailing null-term byte */  		symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2,  						   UTF16_LITTLE_ENDIAN, -						   symlink_buf->PathBuffer, +						   symlink_buf->Target,  						   symname_utf8_maxlen);  		*buf = (struct reparse_data_buffer *)symlink_buf;  		buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len; @@ -1016,29 +1016,36 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf  				     struct cifs_open_info_data *data)  {  	int len = le16_to_cpu(buf->ReparseDataLength); +	int data_offset = offsetof(typeof(*buf), Target) - offsetof(typeof(*buf), Version);  	int symname_utf8_len;  	__le16 *symname_utf16;  	int symname_utf16_len; -	if (len <= sizeof(buf->Flags)) { +	if (len <= data_offset) {  		cifs_dbg(VFS, "srv returned malformed wsl symlink buffer\n");  		return -EIO;  	} -	/* PathBuffer is in UTF-8 but without trailing null-term byte */ -	symname_utf8_len = len - sizeof(buf->Flags); +	/* MS-FSCC 2.1.2.7 defines layout of the Target field only for Version 2. 
*/ +	if (le32_to_cpu(buf->Version) != 2) { +		cifs_dbg(VFS, "srv returned unsupported wsl symlink version %u\n", le32_to_cpu(buf->Version)); +		return -EIO; +	} + +	/* Target for Version 2 is in UTF-8 but without trailing null-term byte */ +	symname_utf8_len = len - data_offset;  	/*  	 * Check that buffer does not contain null byte  	 * because Linux cannot process symlink with null byte.  	 */ -	if (strnlen(buf->PathBuffer, symname_utf8_len) != symname_utf8_len) { +	if (strnlen(buf->Target, symname_utf8_len) != symname_utf8_len) {  		cifs_dbg(VFS, "srv returned null byte in wsl symlink target location\n");  		return -EIO;  	}  	symname_utf16 = kzalloc(symname_utf8_len * 2, GFP_KERNEL);  	if (!symname_utf16)  		return -ENOMEM; -	symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len, +	symname_utf16_len = utf8s_to_utf16s(buf->Target, symname_utf8_len,  					    UTF16_LITTLE_ENDIAN,  					    (wchar_t *) symname_utf16, symname_utf8_len * 2);  	if (symname_utf16_len < 0) { @@ -1062,8 +1069,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf,  			const char *full_path,  			struct cifs_open_info_data *data)  { -	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); -  	data->reparse.buf = buf;  	/* See MS-FSCC 2.1.2 */ @@ -1090,24 +1095,17 @@ int parse_reparse_point(struct reparse_data_buffer *buf,  		}  		return 0;  	default: -		cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", -			      le32_to_cpu(buf->ReparseTag));  		return -EOPNOTSUPP;  	}  } -int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, -			     const char *full_path, -			     struct kvec *rsp_iov, -			     struct cifs_open_info_data *data) +struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, +							  u32 *plen)  { -	struct reparse_data_buffer *buf;  	struct smb2_ioctl_rsp *io = rsp_iov->iov_base; -	u32 plen = le32_to_cpu(io->OutputCount); - -	buf = (struct reparse_data_buffer *)((u8 *)io + -					     
le32_to_cpu(io->OutputOffset)); -	return parse_reparse_point(buf, plen, cifs_sb, full_path, data); +	*plen = le32_to_cpu(io->OutputCount); +	return (struct reparse_data_buffer *)((u8 *)io + +					      le32_to_cpu(io->OutputOffset));  }  static bool wsl_to_fattr(struct cifs_open_info_data *data, @@ -1233,16 +1231,6 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,  	bool ok;  	switch (tag) { -	case IO_REPARSE_TAG_INTERNAL: -		if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY)) -			return false; -		fallthrough; -	case IO_REPARSE_TAG_DFS: -	case IO_REPARSE_TAG_DFSR: -	case IO_REPARSE_TAG_MOUNT_POINT: -		/* See cifs_create_junction_fattr() */ -		fattr->cf_mode = S_IFDIR | 0711; -		break;  	case IO_REPARSE_TAG_LX_SYMLINK:  	case IO_REPARSE_TAG_LX_FIFO:  	case IO_REPARSE_TAG_AF_UNIX: @@ -1262,7 +1250,14 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,  		fattr->cf_mode |= S_IFLNK;  		break;  	default: -		return false; +		if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY)) +			return false; +		if (!IS_REPARSE_TAG_NAME_SURROGATE(tag) && +		    tag != IO_REPARSE_TAG_INTERNAL) +			return false; +		/* See cifs_create_junction_fattr() */ +		fattr->cf_mode = S_IFDIR | 0711; +		break;  	}  	fattr->cf_dtype = S_DT(fattr->cf_mode); diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index c0be5ab45a78..08de853b36a8 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -135,9 +135,6 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,  int smb2_mknod_reparse(unsigned int xid, struct inode *inode,  		       struct dentry *dentry, struct cifs_tcon *tcon,  		       const char *full_path, umode_t mode, dev_t dev); -int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, -			     const char *full_path, -			     struct kvec *rsp_iov, -			     struct cifs_open_info_data *data); +struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len);  #endif /* _CIFS_REPARSE_H */ diff 
--git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index f2ca5963cd9d..b3fa9ee26912 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -680,6 +680,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)  	*pbcc_area = bcc_ptr;  } +static void +ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ +	char *bcc_ptr = *pbcc_area; + +	strcpy(bcc_ptr, "Linux version "); +	bcc_ptr += strlen("Linux version "); +	strcpy(bcc_ptr, init_utsname()->release); +	bcc_ptr += strlen(init_utsname()->release) + 1; + +	strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); +	bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + +	*pbcc_area = bcc_ptr; +} +  static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,  				   const struct nls_table *nls_cp)  { @@ -704,6 +720,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,  	*pbcc_area = bcc_ptr;  } +static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses, +				const struct nls_table *nls_cp) +{ +	char *bcc_ptr = *pbcc_area; +	int len; + +	/* copy domain */ +	if (ses->domainName != NULL) { +		len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); +		if (WARN_ON_ONCE(len < 0)) +			len = CIFS_MAX_DOMAINNAME_LEN - 1; +		bcc_ptr += len; +	} /* else we send a null domain name so server will default to its own domain */ +	*bcc_ptr = 0; +	bcc_ptr++; + +	*pbcc_area = bcc_ptr; +} +  static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,  				   const struct nls_table *nls_cp)  { @@ -749,25 +784,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,  	*bcc_ptr = 0;  	bcc_ptr++; /* account for null termination */ -	/* copy domain */ -	if (ses->domainName != NULL) { -		len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); -		if (WARN_ON_ONCE(len < 0)) -			len = CIFS_MAX_DOMAINNAME_LEN - 1; -		bcc_ptr += len; -	} /* else we send a null domain name so server will default to its own domain */ -	
*bcc_ptr = 0; -	bcc_ptr++; -  	/* BB check for overflow here */ -	strcpy(bcc_ptr, "Linux version "); -	bcc_ptr += strlen("Linux version "); -	strcpy(bcc_ptr, init_utsname()->release); -	bcc_ptr += strlen(init_utsname()->release) + 1; - -	strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); -	bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; +	ascii_domain_string(&bcc_ptr, ses, nls_cp); +	ascii_oslm_strings(&bcc_ptr, nls_cp);  	*pbcc_area = bcc_ptr;  } @@ -1570,7 +1590,7 @@ sess_auth_kerberos(struct sess_data *sess_data)  	sess_data->iov[1].iov_len = msg->secblob_len;  	pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); -	if (ses->capabilities & CAP_UNICODE) { +	if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) {  		/* unicode strings must be word aligned */  		if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {  			*bcc_ptr = 0; @@ -1579,8 +1599,8 @@ sess_auth_kerberos(struct sess_data *sess_data)  		unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);  		unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);  	} else { -		/* BB: is this right? 
*/ -		ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); +		ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); +		ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp);  	}  	sess_data->iov[2].iov_len = (long) bcc_ptr - diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 26df807fbe7a..b27a182629ec 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -432,7 +432,7 @@ cifs_negotiate(const unsigned int xid,  }  static unsigned int -cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) +smb1_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)  {  	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);  	struct TCP_Server_Info *server = tcon->ses->server; @@ -467,7 +467,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)  }  static unsigned int -cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) +smb1_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)  {  	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);  	struct TCP_Server_Info *server = tcon->ses->server; @@ -543,24 +543,104 @@ static int cifs_query_path_info(const unsigned int xid,  				const char *full_path,  				struct cifs_open_info_data *data)  { -	int rc; +	int rc = -EOPNOTSUPP;  	FILE_ALL_INFO fi = {}; +	struct cifs_search_info search_info = {}; +	bool non_unicode_wildcard = false;  	data->reparse_point = false;  	data->adjust_tz = false; -	/* could do find first instead but this returns more info */ -	rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, -			      cifs_remap(cifs_sb));  	/* -	 * BB optimize code so we do not make the above call when server claims -	 * no NT SMB support and the above call failed at least once - set flag -	 * in tcon or mount. +	 * First try CIFSSMBQPathInfo() function which returns more info +	 * (NumberOfLinks) than CIFSFindFirst() fallback function. 
+	 * Some servers like Win9x do not support SMB_QUERY_FILE_ALL_INFO over +	 * TRANS2_QUERY_PATH_INFORMATION, but supports it with filehandle over +	 * TRANS2_QUERY_FILE_INFORMATION (function CIFSSMBQFileInfo(). But SMB +	 * Open command on non-NT servers works only for files, does not work +	 * for directories. And moreover Win9x SMB server returns bogus data in +	 * SMB_QUERY_FILE_ALL_INFO Attributes field. So for non-NT servers, +	 * do not even use CIFSSMBQPathInfo() or CIFSSMBQFileInfo() function. +	 */ +	if (tcon->ses->capabilities & CAP_NT_SMBS) +		rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, +				      cifs_sb->local_nls, cifs_remap(cifs_sb)); + +	/* +	 * Non-UNICODE variant of fallback functions below expands wildcards, +	 * so they cannot be used for querying paths with wildcard characters. +	 */ +	if (rc && !(tcon->ses->capabilities & CAP_UNICODE) && strpbrk(full_path, "*?\"><")) +		non_unicode_wildcard = true; + +	/* +	 * Then fallback to CIFSFindFirst() which works also with non-NT servers +	 * but does not does not provide NumberOfLinks. 
+	 */ +	if ((rc == -EOPNOTSUPP || rc == -EINVAL) && +	    !non_unicode_wildcard) { +		if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) +			search_info.info_level = SMB_FIND_FILE_INFO_STANDARD; +		else +			search_info.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; +		rc = CIFSFindFirst(xid, tcon, full_path, cifs_sb, NULL, +				   CIFS_SEARCH_CLOSE_ALWAYS | CIFS_SEARCH_CLOSE_AT_END, +				   &search_info, false); +		if (rc == 0) { +			if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) { +				FIND_FILE_STANDARD_INFO *di; +				int offset = tcon->ses->server->timeAdj; + +				di = (FIND_FILE_STANDARD_INFO *)search_info.srch_entries_start; +				fi.CreationTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( +						di->CreationDate, di->CreationTime, offset))); +				fi.LastAccessTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( +						di->LastAccessDate, di->LastAccessTime, offset))); +				fi.LastWriteTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( +						di->LastWriteDate, di->LastWriteTime, offset))); +				fi.ChangeTime = fi.LastWriteTime; +				fi.Attributes = cpu_to_le32(le16_to_cpu(di->Attributes)); +				fi.AllocationSize = cpu_to_le64(le32_to_cpu(di->AllocationSize)); +				fi.EndOfFile = cpu_to_le64(le32_to_cpu(di->DataSize)); +			} else { +				FILE_FULL_DIRECTORY_INFO *di; + +				di = (FILE_FULL_DIRECTORY_INFO *)search_info.srch_entries_start; +				fi.CreationTime = di->CreationTime; +				fi.LastAccessTime = di->LastAccessTime; +				fi.LastWriteTime = di->LastWriteTime; +				fi.ChangeTime = di->ChangeTime; +				fi.Attributes = di->ExtFileAttributes; +				fi.AllocationSize = di->AllocationSize; +				fi.EndOfFile = di->EndOfFile; +				fi.EASize = di->EaSize; +			} +			fi.NumberOfLinks = cpu_to_le32(1); +			fi.DeletePending = 0; +			fi.Directory = !!(le32_to_cpu(fi.Attributes) & ATTR_DIRECTORY); +			cifs_buf_release(search_info.ntwrk_buf_start); +		} else if (!full_path[0]) { +			/* +			 * CIFSFindFirst() does not work 
on root path if the +			 * root path was exported on the server from the top +			 * level path (drive letter). +			 */ +			rc = -EOPNOTSUPP; +		} +	} + +	/* +	 * If everything failed then fallback to the legacy SMB command +	 * SMB_COM_QUERY_INFORMATION which works with all servers, but +	 * provide just few information.  	 */ -	if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { +	if ((rc == -EOPNOTSUPP || rc == -EINVAL) && !non_unicode_wildcard) {  		rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls,  					 cifs_remap(cifs_sb));  		data->adjust_tz = true; +	} else if ((rc == -EOPNOTSUPP || rc == -EINVAL) && non_unicode_wildcard) { +		/* Path with non-UNICODE wildcard character cannot exist. */ +		rc = -ENOENT;  	}  	if (!rc) { @@ -568,6 +648,42 @@ static int cifs_query_path_info(const unsigned int xid,  		data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE;  	} +#ifdef CONFIG_CIFS_XATTR +	/* +	 * For WSL CHR and BLK reparse points it is required to fetch +	 * EA $LXDEV which contains major and minor device numbers. +	 */ +	if (!rc && data->reparse_point) { +		struct smb2_file_full_ea_info *ea; + +		ea = (struct smb2_file_full_ea_info *)data->wsl.eas; +		rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV, +				    &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1], +				    SMB2_WSL_XATTR_DEV_SIZE, cifs_sb); +		if (rc == SMB2_WSL_XATTR_DEV_SIZE) { +			ea->next_entry_offset = cpu_to_le32(0); +			ea->flags = 0; +			ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN; +			ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE); +			memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1); +			data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + +					    SMB2_WSL_XATTR_DEV_SIZE; +			rc = 0; +		} else if (rc >= 0) { +			/* It is an error if EA $LXDEV has wrong size. */ +			rc = -EINVAL; +		} else { +			/* +			 * In all other cases ignore error if fetching +			 * of EA $LXDEV failed. 
It is needed only for +			 * WSL CHR and BLK reparse points and wsl_to_fattr() +			 * handle the case when EA is missing. +			 */ +			rc = 0; +		} +	} +#endif +  	return rc;  } @@ -603,6 +719,13 @@ static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,  	int rc;  	FILE_ALL_INFO fi = {}; +	/* +	 * CIFSSMBQFileInfo() for non-NT servers returns bogus data in +	 * Attributes fields. So do not use this command for non-NT servers. +	 */ +	if (!(tcon->ses->capabilities & CAP_NT_SMBS)) +		return -EOPNOTSUPP; +  	if (cfile->symlink_target) {  		data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);  		if (!data->symlink_target) @@ -773,6 +896,9 @@ smb_set_file_info(struct inode *inode, const char *full_path,  	struct cifs_fid fid;  	struct cifs_open_parms oparms;  	struct cifsFileInfo *open_file; +	FILE_BASIC_INFO new_buf; +	struct cifs_open_info_data query_data; +	__le64 write_time = buf->LastWriteTime;  	struct cifsInodeInfo *cinode = CIFS_I(inode);  	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);  	struct tcon_link *tlink = NULL; @@ -780,20 +906,58 @@ smb_set_file_info(struct inode *inode, const char *full_path,  	/* if the file is already open for write, just use that fileid */  	open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY); +  	if (open_file) {  		fid.netfid = open_file->fid.netfid;  		netpid = open_file->pid;  		tcon = tlink_tcon(open_file->tlink); -		goto set_via_filehandle; +	} else { +		tlink = cifs_sb_tlink(cifs_sb); +		if (IS_ERR(tlink)) { +			rc = PTR_ERR(tlink); +			tlink = NULL; +			goto out; +		} +		tcon = tlink_tcon(tlink);  	} -	tlink = cifs_sb_tlink(cifs_sb); -	if (IS_ERR(tlink)) { -		rc = PTR_ERR(tlink); -		tlink = NULL; -		goto out; +	/* +	 * Non-NT servers interprets zero time value in SMB_SET_FILE_BASIC_INFO +	 * over TRANS2_SET_FILE_INFORMATION as a valid time value. NT servers +	 * interprets zero time value as do not change existing value on server. 
+	 * API of ->set_file_info() callback expects that zero time value has +	 * the NT meaning - do not change. Therefore if server is non-NT and +	 * some time values in "buf" are zero, then fetch missing time values. +	 */ +	if (!(tcon->ses->capabilities & CAP_NT_SMBS) && +	    (!buf->CreationTime || !buf->LastAccessTime || +	     !buf->LastWriteTime || !buf->ChangeTime)) { +		rc = cifs_query_path_info(xid, tcon, cifs_sb, full_path, &query_data); +		if (rc) { +			if (open_file) { +				cifsFileInfo_put(open_file); +				open_file = NULL; +			} +			goto out; +		} +		/* +		 * Original write_time from buf->LastWriteTime is preserved +		 * as SMBSetInformation() interprets zero as do not change. +		 */ +		new_buf = *buf; +		buf = &new_buf; +		if (!buf->CreationTime) +			buf->CreationTime = query_data.fi.CreationTime; +		if (!buf->LastAccessTime) +			buf->LastAccessTime = query_data.fi.LastAccessTime; +		if (!buf->LastWriteTime) +			buf->LastWriteTime = query_data.fi.LastWriteTime; +		if (!buf->ChangeTime) +			buf->ChangeTime = query_data.fi.ChangeTime;  	} -	tcon = tlink_tcon(tlink); + +	if (open_file) +		goto set_via_filehandle;  	rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,  				cifs_sb); @@ -814,8 +978,45 @@ smb_set_file_info(struct inode *inode, const char *full_path,  		.fid = &fid,  	}; -	cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); -	rc = CIFS_open(xid, &oparms, &oplock, NULL); +	if (S_ISDIR(inode->i_mode) && !(tcon->ses->capabilities & CAP_NT_SMBS)) { +		/* Opening directory path is not possible on non-NT servers. */ +		rc = -EOPNOTSUPP; +	} else { +		/* +		 * Use cifs_open_file() instead of CIFS_open() as the +		 * cifs_open_file() selects the correct function which +		 * works also on non-NT servers. +		 */ +		rc = cifs_open_file(xid, &oparms, &oplock, NULL); +		/* +		 * Opening path for writing on non-NT servers is not +		 * possible when the read-only attribute is already set. 
+		 * Non-NT server in this case returns -EACCES. For those +		 * servers the only possible way how to clear the read-only +		 * bit is via SMB_COM_SETATTR command. +		 */ +		if (rc == -EACCES && +		    (cinode->cifsAttrs & ATTR_READONLY) && +		     le32_to_cpu(buf->Attributes) != 0 && /* 0 = do not change attrs */ +		     !(le32_to_cpu(buf->Attributes) & ATTR_READONLY) && +		     !(tcon->ses->capabilities & CAP_NT_SMBS)) +			rc = -EOPNOTSUPP; +	} + +	/* Fallback to SMB_COM_SETATTR command when absolutelty needed. */ +	if (rc == -EOPNOTSUPP) { +		cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n"); +		rc = SMBSetInformation(xid, tcon, full_path, +				       buf->Attributes != 0 ? buf->Attributes : cpu_to_le32(cinode->cifsAttrs), +				       write_time, +				       cifs_sb->local_nls, cifs_sb); +		if (rc == 0) +			cinode->cifsAttrs = le32_to_cpu(buf->Attributes); +		else +			rc = -EACCES; +		goto out; +	} +  	if (rc != 0) {  		if (rc == -EIO)  			rc = -EINVAL; @@ -823,6 +1024,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,  	}  	netpid = current->tgid; +	cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for attrs/times not supported by this server\n");  set_via_filehandle:  	rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); @@ -833,6 +1035,21 @@ set_via_filehandle:  		CIFSSMBClose(xid, tcon, fid.netfid);  	else  		cifsFileInfo_put(open_file); + +	/* +	 * Setting the read-only bit is not honered on non-NT servers when done +	 * via open-semantics. So for setting it, use SMB_COM_SETATTR command. +	 * This command works only after the file is closed, so use it only when +	 * operation was called without the filehandle. 
+	 */ +	if (open_file == NULL && +	    !(tcon->ses->capabilities & CAP_NT_SMBS) && +	    le32_to_cpu(buf->Attributes) & ATTR_READONLY) { +		SMBSetInformation(xid, tcon, full_path, +				  buf->Attributes, +				  0 /* do not change write time */, +				  cifs_sb->local_nls, cifs_sb); +	}  out:  	if (tlink != NULL)  		cifs_put_tlink(tlink); @@ -970,18 +1187,13 @@ static int cifs_query_symlink(const unsigned int xid,  	return rc;  } -static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, -				    const char *full_path, -				    struct kvec *rsp_iov, -				    struct cifs_open_info_data *data) +static struct reparse_data_buffer *cifs_get_reparse_point_buffer(const struct kvec *rsp_iov, +								 u32 *plen)  { -	struct reparse_data_buffer *buf;  	TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; -	u32 plen = le16_to_cpu(io->ByteCount); - -	buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + -					     le32_to_cpu(io->DataOffset)); -	return parse_reparse_point(buf, plen, cifs_sb, full_path, data); +	*plen = le16_to_cpu(io->ByteCount); +	return (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + +					      le32_to_cpu(io->DataOffset));  }  static bool @@ -1130,8 +1342,8 @@ struct smb_version_operations smb1_operations = {  	.check_trans2 = cifs_check_trans2,  	.need_neg = cifs_need_neg,  	.negotiate = cifs_negotiate, -	.negotiate_wsize = cifs_negotiate_wsize, -	.negotiate_rsize = cifs_negotiate_rsize, +	.negotiate_wsize = smb1_negotiate_wsize, +	.negotiate_rsize = smb1_negotiate_rsize,  	.sess_setup = CIFS_SessSetup,  	.logoff = CIFSSMBLogoff,  	.tree_connect = CIFSTCon, @@ -1157,7 +1369,7 @@ struct smb_version_operations smb1_operations = {  	.rename = CIFSSMBRename,  	.create_hardlink = CIFSCreateHardLink,  	.query_symlink = cifs_query_symlink, -	.parse_reparse_point = cifs_parse_reparse_point, +	.get_reparse_point_buffer = cifs_get_reparse_point_buffer,  	.open = cifs_open_file,  	.set_fid = cifs_set_fid,  	.close = cifs_close_file, diff --git 
a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 57d9bfbadd97..2a3e46b8e15a 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -666,6 +666,8 @@ finished:  		/* smb2_parse_contexts() fills idata->fi.IndexNumber */  		rc = smb2_parse_contexts(server, &rsp_iov[0], &oparms->fid->epoch,  					 oparms->fid->lease_key, &oplock, &idata->fi, NULL); +		if (rc) +			cifs_dbg(VFS, "rc: %d parsing context of compound op\n", rc);  	}  	for (i = 0; i < num_cmds; i++) { diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 41d8cd20b25f..2fe8eeb98535 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4555,9 +4555,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,  			return rc;  		}  	} else { -		if (unlikely(!server->secmech.dec)) -			return -EIO; - +		rc = smb3_crypto_aead_allocate(server); +		if (unlikely(rc)) +			return rc;  		tfm = server->secmech.dec;  	} @@ -5303,7 +5303,7 @@ struct smb_version_operations smb20_operations = {  	.unlink = smb2_unlink,  	.rename = smb2_rename_path,  	.create_hardlink = smb2_create_hardlink, -	.parse_reparse_point = smb2_parse_reparse_point, +	.get_reparse_point_buffer = smb2_get_reparse_point_buffer,  	.query_mf_symlink = smb3_query_mf_symlink,  	.create_mf_symlink = smb3_create_mf_symlink,  	.create_reparse_symlink = smb2_create_reparse_symlink, @@ -5406,7 +5406,7 @@ struct smb_version_operations smb21_operations = {  	.unlink = smb2_unlink,  	.rename = smb2_rename_path,  	.create_hardlink = smb2_create_hardlink, -	.parse_reparse_point = smb2_parse_reparse_point, +	.get_reparse_point_buffer = smb2_get_reparse_point_buffer,  	.query_mf_symlink = smb3_query_mf_symlink,  	.create_mf_symlink = smb3_create_mf_symlink,  	.create_reparse_symlink = smb2_create_reparse_symlink, @@ -5513,7 +5513,7 @@ struct smb_version_operations smb30_operations = {  	.unlink = smb2_unlink,  	.rename = smb2_rename_path,  	.create_hardlink = smb2_create_hardlink, -	.parse_reparse_point 
= smb2_parse_reparse_point, +	.get_reparse_point_buffer = smb2_get_reparse_point_buffer,  	.query_mf_symlink = smb3_query_mf_symlink,  	.create_mf_symlink = smb3_create_mf_symlink,  	.create_reparse_symlink = smb2_create_reparse_symlink, @@ -5629,7 +5629,7 @@ struct smb_version_operations smb311_operations = {  	.unlink = smb2_unlink,  	.rename = smb2_rename_path,  	.create_hardlink = smb2_create_hardlink, -	.parse_reparse_point = smb2_parse_reparse_point, +	.get_reparse_point_buffer = smb2_get_reparse_point_buffer,  	.query_mf_symlink = smb3_query_mf_symlink,  	.create_mf_symlink = smb3_create_mf_symlink,  	.create_reparse_symlink = smb2_create_reparse_symlink, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 81e05db8e4d5..4e28632b5fd6 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1252,15 +1252,8 @@ SMB2_negotiate(const unsigned int xid,  			cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");  	} -	if (server->cipher_type && !rc) { -		if (!SERVER_IS_CHAN(server)) { -			rc = smb3_crypto_aead_allocate(server); -		} else { -			/* For channels, just reuse the primary server crypto secmech. 
*/ -			server->secmech.enc = server->primary_server->secmech.enc; -			server->secmech.dec = server->primary_server->secmech.dec; -		} -	} +	if (server->cipher_type && !rc) +		rc = smb3_crypto_aead_allocate(server);  neg_exit:  	free_rsp_buf(resp_buftype, rsp);  	return rc; @@ -2928,6 +2921,7 @@ replay_again:  		req->CreateContextsOffset = cpu_to_le32(  			sizeof(struct smb2_create_req) +  			iov[1].iov_len); +		le32_add_cpu(&req->CreateContextsLength, iov[n_iov-1].iov_len);  		pc_buf = iov[n_iov-1].iov_base;  	} @@ -2974,7 +2968,7 @@ replay_again:  	/* Eventually save off posix specific response info and timestamps */  err_free_rsp_buf: -	free_rsp_buf(resp_buftype, rsp); +	free_rsp_buf(resp_buftype, rsp_iov.iov_base);  	kfree(pc_buf);  err_free_req:  	cifs_small_buf_release(req); @@ -4099,12 +4093,8 @@ static void cifs_renegotiate_iosize(struct TCP_Server_Info *server,  		return;  	spin_lock(&tcon->sb_list_lock); -	list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link) { -		cifs_sb->ctx->rsize = -			server->ops->negotiate_rsize(tcon, cifs_sb->ctx); -		cifs_sb->ctx->wsize = -			server->ops->negotiate_wsize(tcon, cifs_sb->ctx); -	} +	list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link) +		cifs_negotiate_iosize(server, cifs_sb->ctx, tcon);  	spin_unlock(&tcon->sb_list_lock);  } diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 764dca80c15c..f79a5165a7cc 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1567,13 +1567,13 @@ struct reparse_nfs_data_buffer {  	__u8	DataBuffer[];  } __packed; -/* For IO_REPARSE_TAG_LX_SYMLINK */ +/* For IO_REPARSE_TAG_LX_SYMLINK - see MS-FSCC 2.1.2.7 */  struct reparse_wsl_symlink_data_buffer {  	__le32	ReparseTag;  	__le16	ReparseDataLength;  	__u16	Reserved; -	__le32	Flags; -	__u8	PathBuffer[]; /* Variable Length UTF-8 string without nul-term */ +	__le32	Version; /* Always 2 */ +	__u8	Target[]; /* Variable Length UTF-8 string without nul-term */  } __packed;  struct 
validate_negotiate_info_req { diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 83caa3849749..b3d121052408 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -550,7 +550,19 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,  		retval = -ENOMEM;  		goto out;  	} -	sess->user = user; + +	if (!sess->user) { +		/* First successful authentication */ +		sess->user = user; +	} else { +		if (!ksmbd_compare_user(sess->user, user)) { +			ksmbd_debug(AUTH, "different user tried to reuse session\n"); +			retval = -EPERM; +			ksmbd_free_user(user); +			goto out; +		} +		ksmbd_free_user(user); +	}  	memcpy(sess->sess_key, resp->payload, resp->session_key_len);  	memcpy(out_blob, resp->payload + resp->session_key_len, diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c1f22c129111..83764c230e9d 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -39,8 +39,10 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)  	xa_destroy(&conn->sessions);  	kvfree(conn->request_buf);  	kfree(conn->preauth_info); -	if (atomic_dec_and_test(&conn->refcnt)) +	if (atomic_dec_and_test(&conn->refcnt)) { +		ksmbd_free_transport(conn->transport);  		kfree(conn); +	}  }  /** diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 3f45f28f6f0f..9dec4c2940bc 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -59,10 +59,12 @@ static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess)  	struct ksmbd_session_rpc *entry;  	long index; +	down_write(&sess->rpc_lock);  	xa_for_each(&sess->rpc_handle_list, index, entry) {  		xa_erase(&sess->rpc_handle_list, index);  		__session_rpc_close(sess, entry);  	} +	up_write(&sess->rpc_lock);  	xa_destroy(&sess->rpc_handle_list);  } @@ -92,7 +94,7 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)  {  	struct ksmbd_session_rpc *entry, *old;  	struct ksmbd_rpc_command *resp; -	
int method; +	int method, id;  	method = __rpc_method(rpc_name);  	if (!method) @@ -102,26 +104,29 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)  	if (!entry)  		return -ENOMEM; +	down_read(&sess->rpc_lock);  	entry->method = method; -	entry->id = ksmbd_ipc_id_alloc(); -	if (entry->id < 0) +	entry->id = id = ksmbd_ipc_id_alloc(); +	if (id < 0)  		goto free_entry; -	old = xa_store(&sess->rpc_handle_list, entry->id, entry, KSMBD_DEFAULT_GFP); +	old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP);  	if (xa_is_err(old))  		goto free_id; -	resp = ksmbd_rpc_open(sess, entry->id); +	resp = ksmbd_rpc_open(sess, id);  	if (!resp)  		goto erase_xa; +	up_read(&sess->rpc_lock);  	kvfree(resp); -	return entry->id; +	return id;  erase_xa:  	xa_erase(&sess->rpc_handle_list, entry->id);  free_id:  	ksmbd_rpc_id_free(entry->id);  free_entry:  	kfree(entry); +	up_read(&sess->rpc_lock);  	return -EINVAL;  } @@ -129,9 +134,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)  {  	struct ksmbd_session_rpc *entry; +	down_write(&sess->rpc_lock);  	entry = xa_erase(&sess->rpc_handle_list, id);  	if (entry)  		__session_rpc_close(sess, entry); +	up_write(&sess->rpc_lock);  }  int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) @@ -439,6 +446,7 @@ static struct ksmbd_session *__session_create(int protocol)  	sess->sequence_number = 1;  	rwlock_init(&sess->tree_conns_lock);  	atomic_set(&sess->refcnt, 2); +	init_rwsem(&sess->rpc_lock);  	ret = __init_smb2_session(sess);  	if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index f21348381d59..c5749d6ec715 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -63,6 +63,7 @@ struct ksmbd_session {  	rwlock_t			tree_conns_lock;  	atomic_t			refcnt; +	struct rw_semaphore		rpc_lock;  };  static inline int test_session_flag(struct ksmbd_session *sess, int bit) diff --git a/fs/smb/server/oplock.c 
b/fs/smb/server/oplock.c index f103b1bd0400..03f606afad93 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -129,14 +129,6 @@ static void free_opinfo(struct oplock_info *opinfo)  	kfree(opinfo);  } -static inline void opinfo_free_rcu(struct rcu_head *rcu_head) -{ -	struct oplock_info *opinfo; - -	opinfo = container_of(rcu_head, struct oplock_info, rcu_head); -	free_opinfo(opinfo); -} -  struct oplock_info *opinfo_get(struct ksmbd_file *fp)  {  	struct oplock_info *opinfo; @@ -157,8 +149,8 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)  	if (list_empty(&ci->m_op_list))  		return NULL; -	rcu_read_lock(); -	opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info, +	down_read(&ci->m_lock); +	opinfo = list_first_entry(&ci->m_op_list, struct oplock_info,  					op_entry);  	if (opinfo) {  		if (opinfo->conn == NULL || @@ -171,8 +163,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)  			}  		}  	} - -	rcu_read_unlock(); +	up_read(&ci->m_lock);  	return opinfo;  } @@ -185,7 +176,7 @@ void opinfo_put(struct oplock_info *opinfo)  	if (!atomic_dec_and_test(&opinfo->refcount))  		return; -	call_rcu(&opinfo->rcu_head, opinfo_free_rcu); +	free_opinfo(opinfo);  }  static void opinfo_add(struct oplock_info *opinfo) @@ -193,7 +184,7 @@ static void opinfo_add(struct oplock_info *opinfo)  	struct ksmbd_inode *ci = opinfo->o_fp->f_ci;  	down_write(&ci->m_lock); -	list_add_rcu(&opinfo->op_entry, &ci->m_op_list); +	list_add(&opinfo->op_entry, &ci->m_op_list);  	up_write(&ci->m_lock);  } @@ -207,7 +198,7 @@ static void opinfo_del(struct oplock_info *opinfo)  		write_unlock(&lease_list_lock);  	}  	down_write(&ci->m_lock); -	list_del_rcu(&opinfo->op_entry); +	list_del(&opinfo->op_entry);  	up_write(&ci->m_lock);  } @@ -1347,8 +1338,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,  	ci = fp->f_ci;  	op = opinfo_get(fp); -	rcu_read_lock(); -	list_for_each_entry_rcu(brk_op, 
&ci->m_op_list, op_entry) { +	down_read(&ci->m_lock); +	list_for_each_entry(brk_op, &ci->m_op_list, op_entry) {  		if (brk_op->conn == NULL)  			continue; @@ -1358,7 +1349,6 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,  		if (ksmbd_conn_releasing(brk_op->conn))  			continue; -		rcu_read_unlock();  		if (brk_op->is_lease && (brk_op->o_lease->state &  		    (~(SMB2_LEASE_READ_CACHING_LE |  				SMB2_LEASE_HANDLE_CACHING_LE)))) { @@ -1388,9 +1378,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,  		oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE, NULL);  next:  		opinfo_put(brk_op); -		rcu_read_lock();  	} -	rcu_read_unlock(); +	up_read(&ci->m_lock);  	if (op)  		opinfo_put(op); @@ -1507,7 +1496,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req)  		if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) <  		    sizeof(struct create_lease_v2) - 4) -			return NULL; +			goto err_out;  		memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);  		lreq->req_state = lc->lcontext.LeaseState; @@ -1523,7 +1512,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req)  		if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) <  		    sizeof(struct create_lease)) -			return NULL; +			goto err_out;  		memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);  		lreq->req_state = lc->lcontext.LeaseState; @@ -1532,6 +1521,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req)  		lreq->version = 1;  	}  	return lreq; +err_out: +	kfree(lreq); +	return NULL;  }  /** diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h index 3f64f0787263..9a56eaadd0dd 100644 --- a/fs/smb/server/oplock.h +++ b/fs/smb/server/oplock.h @@ -71,7 +71,6 @@ struct oplock_info {  	struct list_head        lease_entry;  	wait_queue_head_t oplock_q; /* Other server threads */  	wait_queue_head_t oplock_brk; /* oplock breaking wait */ -	struct rcu_head		rcu_head;  };  struct 
lease_break_info { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index d24d95d15d87..f2a2be8467c6 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -633,6 +633,11 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)  		return name;  	} +	if (*name == '\0') { +		kfree(name); +		return ERR_PTR(-EINVAL); +	} +  	if (*name == '\\') {  		pr_err("not allow directory name included leading slash\n");  		kfree(name); @@ -1445,7 +1450,7 @@ static int ntlm_authenticate(struct ksmbd_work *work,  {  	struct ksmbd_conn *conn = work->conn;  	struct ksmbd_session *sess = work->sess; -	struct channel *chann = NULL; +	struct channel *chann = NULL, *old;  	struct ksmbd_user *user;  	u64 prev_id;  	int sz, rc; @@ -1557,7 +1562,12 @@ binding_session:  				return -ENOMEM;  			chann->conn = conn; -			xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP); +			old = xa_store(&sess->ksmbd_chann_list, (long)conn, chann, +					KSMBD_DEFAULT_GFP); +			if (xa_is_err(old)) { +				kfree(chann); +				return xa_err(old); +			}  		}  	} @@ -1602,9 +1612,6 @@ static int krb5_authenticate(struct ksmbd_work *work,  	if (prev_sess_id && prev_sess_id != sess->id)  		destroy_previous_session(conn, sess->user, prev_sess_id); -	if (sess->state == SMB2_SESSION_VALID) -		ksmbd_free_user(sess->user); -  	retval = ksmbd_krb5_authenticate(sess, in_blob, in_len,  					 out_blob, &out_len);  	if (retval) { @@ -2247,10 +2254,6 @@ int smb2_session_logoff(struct ksmbd_work *work)  	sess->state = SMB2_SESSION_EXPIRED;  	up_write(&conn->session_lock); -	if (sess->user) { -		ksmbd_free_user(sess->user); -		sess->user = NULL; -	}  	ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP);  	rsp->StructureSize = cpu_to_le16(4); diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index a3d8a905b07e..d742ba754348 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -72,6 +72,8 @@  #define 
FILE_SUPPORTS_ENCRYPTION	0x00020000  #define FILE_SUPPORTS_OBJECT_IDS	0x00010000  #define FILE_VOLUME_IS_COMPRESSED	0x00008000 +#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400 +#define FILE_RETURNS_CLEANUP_RESULT_INFO  0x00000200  #define FILE_SUPPORTS_REMOTE_STORAGE	0x00000100  #define FILE_SUPPORTS_REPARSE_POINTS	0x00000080  #define FILE_SUPPORTS_SPARSE_FILES	0x00000040 diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 3f185ae60dc5..2a3e2b0ce557 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -310,7 +310,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)  	server_conf.signing = req->signing;  	server_conf.tcp_port = req->tcp_port;  	server_conf.ipc_timeout = req->ipc_timeout * HZ; -	server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL; +	if (check_mul_overflow(req->deadtime, SMB_ECHO_INTERVAL, +					&server_conf.deadtime)) { +		ret = -EINVAL; +		goto out; +	}  	server_conf.share_fake_fscaps = req->share_fake_fscaps;  	ksmbd_init_domain(req->sub_auth); @@ -337,6 +341,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)  	server_conf.bind_interfaces_only = req->bind_interfaces_only;  	ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req),  					req->ifc_list_sz); +out:  	if (ret) {  		pr_err("Server configuration error: %s %s %s\n",  		       req->netbios_name, req->server_string, diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 7f38a3c3f5bd..abedf510899a 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -93,17 +93,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)  	return t;  } -static void free_transport(struct tcp_transport *t) +void ksmbd_free_transport(struct ksmbd_transport *kt)  { -	kernel_sock_shutdown(t->sock, SHUT_RDWR); -	sock_release(t->sock); -	t->sock = NULL; +	struct tcp_transport *t = TCP_TRANS(kt); -	
ksmbd_conn_free(KSMBD_TRANS(t)->conn); +	sock_release(t->sock);  	kfree(t->iov);  	kfree(t);  } +static void free_transport(struct tcp_transport *t) +{ +	kernel_sock_shutdown(t->sock, SHUT_RDWR); +	ksmbd_conn_free(KSMBD_TRANS(t)->conn); +} +  /**   * kvec_array_init() - initialize a IO vector segment   * @new:	IO vector to be initialized diff --git a/fs/smb/server/transport_tcp.h b/fs/smb/server/transport_tcp.h index 8c9aa624cfe3..1e51675ee1b2 100644 --- a/fs/smb/server/transport_tcp.h +++ b/fs/smb/server/transport_tcp.h @@ -8,6 +8,7 @@  int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz);  struct interface *ksmbd_find_netdev_name_iface_list(char *netdev_name); +void ksmbd_free_transport(struct ksmbd_transport *kt);  int ksmbd_tcp_init(void);  void ksmbd_tcp_destroy(void); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 8554aa5a1059..482eba0f4dc1 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -426,6 +426,13 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,  		goto out;  	} +	if (v_len <= *pos) { +		pr_err("stream write position %lld is out of bounds (stream length: %zd)\n", +				*pos, v_len); +		err = -EINVAL; +		goto out; +	} +  	if (v_len < size) {  		wbuf = kvzalloc(size, KSMBD_DEFAULT_GFP);  		if (!wbuf) { @@ -479,7 +486,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,  	int err = 0;  	if (work->conn->connection_type) { -		if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { +		if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) || +		    S_ISDIR(file_inode(fp->filp)->i_mode)) {  			pr_err("no right to write(%pD)\n", fp->filp);  			err = -EACCES;  			goto out; diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 8d1f30dcba7e..dfed6fce8904 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -661,21 +661,40 @@ __close_file_table_ids(struct ksmbd_file_table *ft,  		       bool (*skip)(struct 
ksmbd_tree_connect *tcon,  				    struct ksmbd_file *fp))  { -	unsigned int			id; -	struct ksmbd_file		*fp; -	int				num = 0; +	struct ksmbd_file *fp; +	unsigned int id = 0; +	int num = 0; + +	while (1) { +		write_lock(&ft->lock); +		fp = idr_get_next(ft->idr, &id); +		if (!fp) { +			write_unlock(&ft->lock); +			break; +		} -	idr_for_each_entry(ft->idr, fp, id) { -		if (skip(tcon, fp)) +		if (skip(tcon, fp) || +		    !atomic_dec_and_test(&fp->refcount)) { +			id++; +			write_unlock(&ft->lock);  			continue; +		}  		set_close_state_blocked_works(fp); +		idr_remove(ft->idr, fp->volatile_id); +		fp->volatile_id = KSMBD_NO_FID; +		write_unlock(&ft->lock); + +		down_write(&fp->f_ci->m_lock); +		list_del_init(&fp->node); +		up_write(&fp->f_ci->m_lock); -		if (!atomic_dec_and_test(&fp->refcount)) -			continue;  		__ksmbd_close_fd(ft, fp); +  		num++; +		id++;  	} +  	return num;  } @@ -713,12 +732,8 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,  static bool ksmbd_durable_scavenger_alive(void)  { -	mutex_lock(&durable_scavenger_lock); -	if (!durable_scavenger_running) { -		mutex_unlock(&durable_scavenger_lock); +	if (!durable_scavenger_running)  		return false; -	} -	mutex_unlock(&durable_scavenger_lock);  	if (kthread_should_stop())  		return false; @@ -799,9 +814,7 @@ static int ksmbd_durable_scavenger(void *dummy)  			break;  	} -	mutex_lock(&durable_scavenger_lock);  	durable_scavenger_running = false; -	mutex_unlock(&durable_scavenger_lock);  	module_put(THIS_MODULE); diff --git a/fs/splice.c b/fs/splice.c index 90d464241f15..4d6df083e0c0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -45,7 +45,7 @@   * here if set to avoid blocking other users of this pipe if splice is   * being done on it.   
*/ -static noinline void noinline pipe_clear_nowait(struct file *file) +static noinline void pipe_clear_nowait(struct file *file)  {  	fmode_t fmode = READ_ONCE(file->f_mode); diff --git a/fs/stat.c b/fs/stat.c index f13308bfdc98..3d9222807214 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -204,12 +204,25 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,  				  STATX_ATTR_DAX);  	idmap = mnt_idmap(path->mnt); -	if (inode->i_op->getattr) -		return inode->i_op->getattr(idmap, path, stat, -					    request_mask, -					    query_flags); +	if (inode->i_op->getattr) { +		int ret; + +		ret = inode->i_op->getattr(idmap, path, stat, request_mask, +				query_flags); +		if (ret) +			return ret; +	} else { +		generic_fillattr(idmap, request_mask, inode, stat); +	} + +	/* +	 * If this is a block device inode, override the filesystem attributes +	 * with the block device specific parameters that need to be obtained +	 * from the bdev backing inode. +	 */ +	if (S_ISBLK(stat->mode)) +		bdev_statx(path, stat, request_mask); -	generic_fillattr(idmap, request_mask, inode, stat);  	return 0;  }  EXPORT_SYMBOL(vfs_getattr_nosec); @@ -295,15 +308,6 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,  	if (path_mounted(path))  		stat->attributes |= STATX_ATTR_MOUNT_ROOT;  	stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; - -	/* -	 * If this is a block device inode, override the filesystem -	 * attributes with the block device specific parameters that need to be -	 * obtained from the bdev backing inode. 
-	 */ -	if (S_ISBLK(stat->mode)) -		bdev_statx(path, stat, request_mask); -  	return 0;  } diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 4f33a4a48886..b4071c9cf8c9 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -115,7 +115,7 @@ void udf_truncate_tail_extent(struct inode *inode)  	}  	/* This inode entry is in-memory only and thus we don't have to mark  	 * the inode dirty */ -	if (ret == 0) +	if (ret >= 0)  		iinfo->i_lenExtents = inode->i_size;  	brelse(epos.bh);  } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d80f94346199..22f4bf956ba1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1585,8 +1585,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,  	user_uffdio_copy = (struct uffdio_copy __user *) arg;  	ret = -EAGAIN; -	if (atomic_read(&ctx->mmap_changing)) +	if (unlikely(atomic_read(&ctx->mmap_changing))) { +		if (unlikely(put_user(ret, &user_uffdio_copy->copy))) +			return -EFAULT;  		goto out; +	}  	ret = -EFAULT;  	if (copy_from_user(&uffdio_copy, user_uffdio_copy, @@ -1641,8 +1644,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,  	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;  	ret = -EAGAIN; -	if (atomic_read(&ctx->mmap_changing)) +	if (unlikely(atomic_read(&ctx->mmap_changing))) { +		if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) +			return -EFAULT;  		goto out; +	}  	ret = -EFAULT;  	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, @@ -1744,8 +1750,11 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)  	user_uffdio_continue = (struct uffdio_continue __user *)arg;  	ret = -EAGAIN; -	if (atomic_read(&ctx->mmap_changing)) +	if (unlikely(atomic_read(&ctx->mmap_changing))) { +		if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) +			return -EFAULT;  		goto out; +	}  	ret = -EFAULT;  	if (copy_from_user(&uffdio_continue, user_uffdio_continue, @@ -1801,8 +1810,11 @@ static inline int userfaultfd_poison(struct 
userfaultfd_ctx *ctx, unsigned long  	user_uffdio_poison = (struct uffdio_poison __user *)arg;  	ret = -EAGAIN; -	if (atomic_read(&ctx->mmap_changing)) +	if (unlikely(atomic_read(&ctx->mmap_changing))) { +		if (unlikely(put_user(ret, &user_uffdio_poison->updated))) +			return -EFAULT;  		goto out; +	}  	ret = -EFAULT;  	if (copy_from_user(&uffdio_poison, user_uffdio_poison, @@ -1870,8 +1882,12 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,  	user_uffdio_move = (struct uffdio_move __user *) arg; -	if (atomic_read(&ctx->mmap_changing)) -		return -EAGAIN; +	ret = -EAGAIN; +	if (unlikely(atomic_read(&ctx->mmap_changing))) { +		if (unlikely(put_user(ret, &user_uffdio_move->move))) +			return -EFAULT; +		goto out; +	}  	if (copy_from_user(&uffdio_move, user_uffdio_move,  			   /* don't copy "move" last field */ diff --git a/fs/xattr.c b/fs/xattr.c index 02bee149ad96..8ec5b0204bfd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -703,7 +703,7 @@ static int path_setxattrat(int dfd, const char __user *pathname,  		return error;  	filename = getname_maybe_null(pathname, at_flags); -	if (!filename) { +	if (!filename && dfd >= 0) {  		CLASS(fd, f)(dfd);  		if (fd_empty(f))  			error = -EBADF; @@ -847,7 +847,7 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname,  		return error;  	filename = getname_maybe_null(pathname, at_flags); -	if (!filename) { +	if (!filename && dfd >= 0) {  		CLASS(fd, f)(dfd);  		if (fd_empty(f))  			return -EBADF; @@ -1428,6 +1428,15 @@ static bool xattr_is_trusted(const char *name)  	return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);  } +static bool xattr_is_maclabel(const char *name) +{ +	const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + +	return !strncmp(name, XATTR_SECURITY_PREFIX, +			XATTR_SECURITY_PREFIX_LEN) && +		security_ismaclabel(suffix); +} +  /**   * simple_xattr_list - list all xattr objects   * @inode: inode from which to get the xattrs @@ -1460,6 +1469,17 @@ ssize_t 
simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,  	if (err)  		return err; +	err = security_inode_listsecurity(inode, buffer, remaining_size); +	if (err < 0) +		return err; + +	if (buffer) { +		if (remaining_size < err) +			return -ERANGE; +		buffer += err; +	} +	remaining_size -= err; +  	read_lock(&xattrs->lock);  	for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) {  		xattr = rb_entry(rbp, struct simple_xattr, rb_node); @@ -1468,6 +1488,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,  		if (!trusted && xattr_is_trusted(xattr->name))  			continue; +		/* skip MAC labels; these are provided by LSM above */ +		if (xattr_is_maclabel(xattr->name)) +			continue; +  		err = xattr_list_one(&buffer, &remaining_size, xattr->name);  		if (err)  			break; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index fffd6fffdce0..ae0ca6858496 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -3,7 +3,7 @@ config XFS_FS  	tristate "XFS filesystem support"  	depends on BLOCK  	select EXPORTFS -	select LIBCRC32C +	select CRC32  	select FS_IOMAP  	help  	  XFS is a high performance journaling filesystem which originated diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e7f1b324b3b..1a2b3f06fa71 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -105,6 +105,7 @@ xfs_buf_free(  {  	unsigned int		size = BBTOB(bp->b_length); +	might_sleep();  	trace_xfs_buf_free(bp, _RET_IP_);  	ASSERT(list_empty(&bp->b_lru)); diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index b4ffd80b7cb6..dcbfa274e06d 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -165,7 +165,7 @@ xmbuf_map_backing_mem(  	folio_set_dirty(folio);  	folio_unlock(folio); -	bp->b_addr = folio_address(folio); +	bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos);  	return 0;  } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index edbc521870a1..b4e32f0860b7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ 
-1186,9 +1186,8 @@ xfs_qm_dqflush_done(  	if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&  	    (lip->li_lsn == qlip->qli_flush_lsn ||  	     test_bit(XFS_LI_FAILED, &lip->li_flags))) { -  		spin_lock(&ailp->ail_lock); -		xfs_clear_li_failed(lip); +		clear_bit(XFS_LI_FAILED, &lip->li_flags);  		if (lip->li_lsn == qlip->qli_flush_lsn) {  			/* xfs_ail_update_finish() drops the AIL lock */  			tail_lsn = xfs_ail_delete_one(ailp, lip); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index a4bc1642fe56..414b27a86458 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt(  	const struct xfs_fsmap		*keys,  	struct xfs_getfsmap_info	*info)  { +	struct xfs_fsmap		key0 = *keys; /* struct copy */  	struct xfs_mount		*mp = tp->t_mountp;  	struct xfs_rtgroup		*rtg = NULL;  	struct xfs_btree_cur		*bt_cur = NULL; @@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt(  	int				error = 0;  	eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); -	if (keys[0].fmr_physical >= eofs) +	if (key0.fmr_physical >= eofs)  		return 0; +	/* +	 * On zoned filesystems with an internal rt volume, the volume comes +	 * immediately after the end of the data volume.  However, the +	 * xfs_rtblock_t address space is relative to the start of the data +	 * device, which means that the first @rtstart fsblocks do not actually +	 * point anywhere.  If a fsmap query comes in with the low key starting +	 * below @rtstart, report it as "owned by filesystem". +	 */  	rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); -	if (keys[0].fmr_physical < rtstart_daddr) { +	if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) {  		struct xfs_fsmap_irec		frec = {  			.owner			= XFS_RMAP_OWN_FS,  			.len_daddr		= rtstart_daddr,  		}; -		/* Adjust the low key if we are continuing from where we left off. 
*/ -		if (keys[0].fmr_length > 0) { -			info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; -			return 0; +		/* +		 * Adjust the start of the query range if we're picking up from +		 * a previous round, and only emit the record if we haven't +		 * already gone past. +		 */ +		key0.fmr_physical += key0.fmr_length; +		if (key0.fmr_physical < rtstart_daddr) { +			error = xfs_getfsmap_helper(tp, info, &frec); +			if (error) +				return error; + +			key0.fmr_physical = rtstart_daddr;  		} -		/* Fabricate an rmap entry for space occupied by the data dev */ -		error = xfs_getfsmap_helper(tp, info, &frec); -		if (error) -			return error; +		/* Zero the other fields to avoid further adjustments. */ +		key0.fmr_owner = 0; +		key0.fmr_offset = 0; +		key0.fmr_length = 0;  	} -	start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); -	end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + -			min(eofs - 1, keys[1].fmr_physical)); - +	start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical); +	end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));  	info->missing_owner = XFS_FMR_OWN_FREE;  	/* @@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt(  	 * low to the fsmap low key and max out the high key to the end  	 * of the rtgroup.  	 */ -	info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); -	error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); +	info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset); +	error = xfs_fsmap_owner_to_rmap(&info->low, &key0);  	if (error)  		return error; -	info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); -	xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); +	info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length); +	xfs_getfsmap_set_irec_flags(&info->low, &key0);  	/* Adjust the low key if we are continuing from where we left off. 
*/  	if (info->low.rm_blockcount == 0) { diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 40fc1bf900af..c6cb0b6b9e46 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -1089,13 +1089,7 @@ xfs_iflush_abort(  	 * state. Whilst the inode is in the AIL, it should have a valid buffer  	 * pointer for push operations to access - it is only safe to remove the  	 * inode from the buffer once it has been removed from the AIL. -	 * -	 * We also clear the failed bit before removing the item from the AIL -	 * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer -	 * references the inode item owns and needs to hold until we've fully -	 * aborted the inode log item and detached it from the buffer.  	 */ -	clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);  	xfs_trans_ail_delete(&iip->ili_item, 0);  	/* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6493bdb57351..980aabc49512 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2888,7 +2888,7 @@ xlog_force_and_check_iclog(   *   *	1. the current iclog is active and has no data; the previous iclog   *		is in the active or dirty state. - *	2. the current iclog is drity, and the previous iclog is in the + *	2. the current iclog is dirty, and the previous iclog is in the   *		active or dirty state.   *   * We may sleep if: diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 799b84220ebb..e5192c12e7ac 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -229,6 +229,7 @@ typedef struct xfs_mount {  	bool			m_finobt_nores; /* no per-AG finobt resv. */  	bool			m_update_sb;	/* sb needs update in mount */  	unsigned int		m_max_open_zones; +	unsigned int		m_zonegc_low_space;  	/*  	 * Bitsets of per-fs metadata that have been checked and/or are sick. 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b2dd0c0bf509..4a11ddccc563 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1149,7 +1149,7 @@ xfs_init_percpu_counters(  	return 0;  free_freecounters: -	while (--i > 0) +	while (--i >= 0)  		percpu_counter_destroy(&mp->m_free[i].count);  	percpu_counter_destroy(&mp->m_delalloc_rtextents);  free_delalloc: @@ -2114,6 +2114,21 @@ xfs_fs_reconfigure(  	if (error)  		return error; +	/* attr2 -> noattr2 */ +	if (xfs_has_noattr2(new_mp)) { +		if (xfs_has_crc(mp)) { +			xfs_warn(mp, +			"attr2 is always enabled for a V5 filesystem - can't be changed."); +			return -EINVAL; +		} +		mp->m_features &= ~XFS_FEAT_ATTR2; +		mp->m_features |= XFS_FEAT_NOATTR2; +	} else if (xfs_has_attr2(new_mp)) { +		/* noattr2 -> attr2 */ +		mp->m_features &= ~XFS_FEAT_NOATTR2; +		mp->m_features |= XFS_FEAT_ATTR2; +	} +  	/* inode32 -> inode64 */  	if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {  		mp->m_features &= ~XFS_FEAT_SMALL_INUMS; @@ -2126,6 +2141,16 @@ xfs_fs_reconfigure(  		mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);  	} +	/* +	 * Now that mp has been modified according to the remount options, we +	 * do a final option validation with xfs_finish_flags() just like it is +	 * done during mount. We cannot use xfs_finish_flags() on new_mp as it +	 * contains only the user given options. 
+	 */ +	error = xfs_finish_flags(mp); +	if (error) +		return error; +  	/* ro -> rw */  	if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {  		error = xfs_remount_rw(mp); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index b7e82d85f043..7a5c5ef2db92 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -718,8 +718,40 @@ max_open_zones_show(  }  XFS_SYSFS_ATTR_RO(max_open_zones); +static ssize_t +zonegc_low_space_store( +	struct kobject		*kobj, +	const char		*buf, +	size_t			count) +{ +	int			ret; +	unsigned int		val; + +	ret = kstrtouint(buf, 0, &val); +	if (ret) +		return ret; + +	if (val > 100) +		return -EINVAL; + +	zoned_to_mp(kobj)->m_zonegc_low_space = val; + +	return count; +} + +static ssize_t +zonegc_low_space_show( +	struct kobject		*kobj, +	char			*buf) +{ +	return sysfs_emit(buf, "%u\n", +			zoned_to_mp(kobj)->m_zonegc_low_space); +} +XFS_SYSFS_ATTR_RW(zonegc_low_space); +  static struct attribute *xfs_zoned_attrs[] = {  	ATTR_LIST(max_open_zones), +	ATTR_LIST(zonegc_low_space),  	NULL,  };  ATTRIBUTE_GROUPS(xfs_zoned); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fcb1828e598..67c328d23e4a 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -315,7 +315,7 @@ xfs_ail_splice(  }  /* - * Delete the given item from the AIL.  Return a pointer to the item. + * Delete the given item from the AIL.   */  static void  xfs_ail_delete( @@ -777,26 +777,28 @@ xfs_ail_update_finish(  }  /* - * xfs_trans_ail_update - bulk AIL insertion operation. + * xfs_trans_ail_update_bulk - bulk AIL insertion operation.   * - * @xfs_trans_ail_update takes an array of log items that all need to be + * @xfs_trans_ail_update_bulk takes an array of log items that all need to be   * positioned at the same LSN in the AIL. If an item is not in the AIL, it will - * be added.  Otherwise, it will be repositioned  by removing it and re-adding - * it to the AIL. 
If we move the first item in the AIL, update the log tail to - * match the new minimum LSN in the AIL. + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL.   * - * This function takes the AIL lock once to execute the update operations on - * all the items in the array, and as such should not be called with the AIL - * lock held. As a result, once we have the AIL lock, we need to check each log - * item LSN to confirm it needs to be moved forward in the AIL. + * If we move the first item in the AIL, update the log tail to match the new + * minimum LSN in the AIL.   * - * To optimise the insert operation, we delete all the items from the AIL in - * the first pass, moving them into a temporary list, then splice the temporary - * list into the correct position in the AIL. This avoids needing to do an - * insert operation on every item. + * This function should be called with the AIL lock held.   * - * This function must be called with the AIL lock held.  The lock is dropped - * before returning. + * To optimise the insert operation, we add all items to a temporary list, then + * splice this list into the correct position in the AIL. + * + * Items that are already in the AIL are first deleted from their current + * location before being added to the temporary list. + * + * This avoids needing to do an insert operation on every item. + * + * The AIL lock is dropped by xfs_ail_update_finish() before returning to + * the caller.   
*/  void  xfs_trans_ail_update_bulk( @@ -909,10 +911,9 @@ xfs_trans_ail_delete(  		return;  	} -	/* xfs_ail_update_finish() drops the AIL lock */ -	xfs_clear_li_failed(lip); +	clear_bit(XFS_LI_FAILED, &lip->li_flags);  	tail_lsn = xfs_ail_delete_one(ailp, lip); -	xfs_ail_update_finish(ailp, tail_lsn); +	xfs_ail_update_finish(ailp, tail_lsn);	/* drops the AIL lock */  }  int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd841df93021..f945f0450b16 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn(  }  #endif -static inline void -xfs_clear_li_failed( -	struct xfs_log_item	*lip) -{ -	struct xfs_buf	*bp = lip->li_buf; - -	ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); -	lockdep_assert_held(&lip->li_ailp->ail_lock); - -	if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { -		lip->li_buf = NULL; -		xfs_buf_rele(bp); -	} -} - -static inline void -xfs_set_li_failed( -	struct xfs_log_item	*lip, -	struct xfs_buf		*bp) -{ -	lockdep_assert_held(&lip->li_ailp->ail_lock); - -	if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { -		xfs_buf_hold(bp); -		lip->li_buf = bp; -	} -} -  #endif	/* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 52af234936a2..d509e49b2aaa 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1201,6 +1201,13 @@ xfs_mount_zones(  	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,  			iz.available + iz.reclaimable); +	/* +	 * The user may configure GC to free up a percentage of unused blocks. +	 * By default this is 0. GC will always trigger at the minimum level +	 * for keeping max_open_zones available for data placement. 
+	 */ +	mp->m_zonegc_low_space = 0; +  	error = xfs_zone_gc_mount(mp);  	if (error)  		goto out_free_zone_info; diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index c5136ea9bb1d..d613a4094db6 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -162,18 +162,36 @@ struct xfs_zone_gc_data {  /*   * We aim to keep enough zones free in stock to fully use the open zone limit - * for data placement purposes. + * for data placement purposes. Additionally, the m_zonegc_low_space tunable + * can be set to make sure a fraction of the unused blocks are available for + * writing.   */  bool  xfs_zoned_need_gc(  	struct xfs_mount	*mp)  { +	s64			available, free, threshold; +	s32			remainder; +  	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))  		return false; -	if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < + +	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); + +	if (available <  	    mp->m_groups[XG_TYPE_RTG].blocks *  	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))  		return true; + +	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + +	threshold = div_s64_rem(free, 100, &remainder); +	threshold = threshold * mp->m_zonegc_low_space + +		    remainder * div_s64(mp->m_zonegc_low_space, 100); + +	if (available < threshold) +		return true; +  	return false;  } @@ -789,7 +807,8 @@ xfs_zone_gc_write_chunk(  {  	struct xfs_zone_gc_data	*data = chunk->data;  	struct xfs_mount	*mp = chunk->ip->i_mount; -	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset; +	phys_addr_t		bvec_paddr = +		bvec_phys(bio_first_bvec_all(&chunk->bio));  	struct xfs_gc_bio	*split_chunk;  	if (chunk->bio.bi_status) @@ -804,7 +823,7 @@ xfs_zone_gc_write_chunk(  	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);  	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, -			folio_offset); +			offset_in_folio(chunk->scratch->folio, bvec_paddr));  	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))  		
xfs_zone_gc_submit_write(data, split_chunk); | 
