diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-07-28 11:22:56 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-07-28 11:22:56 -0700 |
commit | 7879d7aff0ffd969fcb1a59e3f87ebb353e47b7f (patch) | |
tree | 68885c284e839620d1d0366fde8a13367161c600 | |
parent | 794cbac9c053155754d04231b9365f91ea4ce7d2 (diff) | |
parent | 4e8fc4f7208b032674ef8a4977b96484c328515c (diff) |
Merge tag 'vfs-6.17-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull misc VFS updates from Christian Brauner:
"This contains the usual selections of misc updates for this cycle.
Features:
- Add ext4 IOCB_DONTCACHE support
This refactors the address_space_operations write_begin() and
write_end() callbacks to take const struct kiocb * as their first
argument, allowing IOCB flags such as IOCB_DONTCACHE to propagate
to the filesystem's buffered I/O path.
Ext4 is updated to implement handling of the IOCB_DONTCACHE flag
and advertises support via the FOP_DONTCACHE file operation flag.
Additionally, the i915 driver's shmem write paths are updated to
bypass the legacy write_begin/write_end interface in favor of
directly calling write_iter() with a constructed synchronous kiocb.
Another i915 change replaces a manual write loop with
kernel_write() during GEM shmem object creation.
Cleanups:
- don't duplicate vfs_open() in kernel_file_open()
- proc_fd_getattr(): don't bother with S_ISDIR() check
- fs/ecryptfs: replace snprintf with sysfs_emit in show function
- vfs: Remove unnecessary list_for_each_entry_safe() from
evict_inodes()
- filelock: add new locks_wake_up_waiter() helper
- fs: Remove three arguments from block_write_end()
- VFS: change old_dir and new_dir in struct renamedata to dentrys
- netfs: Remove unused declaration netfs_queue_write_request()
Fixes:
- eventpoll: Fix semi-unbounded recursion
- eventpoll: fix sphinx documentation build warning
- fs/read_write: Fix spelling typo
- fs: annotate data race between poll_schedule_timeout() and
pollwake()
- fs/pipe: set FMODE_NOWAIT in create_pipe_files()
- docs/vfs: update references to i_mutex to i_rwsem
- fs/buffer: remove comment about hard sectorsize
- fs/buffer: remove the min and max limit checks in __getblk_slow()
- fs/libfs: don't assume blocksize <= PAGE_SIZE in
generic_check_addressable
- fs_context: fix parameter name in infofc() macro
- fs: Prevent file descriptor table allocations exceeding INT_MAX"
* tag 'vfs-6.17-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (24 commits)
netfs: Remove unused declaration netfs_queue_write_request()
eventpoll: fix sphinx documentation build warning
ext4: support uncached buffered I/O
mm/pagemap: add write_begin_get_folio() helper function
fs: change write_begin/write_end interface to take struct kiocb *
drm/i915: Refactor shmem_pwrite() to use kiocb and write_iter
drm/i915: Use kernel_write() in shmem object create
eventpoll: Fix semi-unbounded recursion
vfs: Remove unnecessary list_for_each_entry_safe() from evict_inodes()
fs/libfs: don't assume blocksize <= PAGE_SIZE in generic_check_addressable
fs/buffer: remove the min and max limit checks in __getblk_slow()
fs: Prevent file descriptor table allocations exceeding INT_MAX
fs: Remove three arguments from block_write_end()
fs/ecryptfs: replace snprintf with sysfs_emit in show function
fs: annotate suspected data race between poll_schedule_timeout() and pollwake()
docs/vfs: update references to i_mutex to i_rwsem
fs/buffer: remove comment about hard sectorsize
fs_context: fix parameter name in infofc() macro
VFS: change old_dir and new_dir in struct renamedata to dentrys
proc_fd_getattr(): don't bother with S_ISDIR() check
...
88 files changed, 520 insertions, 457 deletions
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 2e567e341c3bd..580581281ed7a 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -253,10 +253,10 @@ prototypes:: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *folio); void (*readahead)(struct readahead_control *); - int (*write_begin)(struct file *, struct address_space *mapping, + int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, + int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index fd32a9a17bfb3..57604b07bdc99 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -758,8 +758,9 @@ process is more complicated and uses write_begin/write_end or dirty_folio to write data into the address_space, and writepages to writeback data to storage. -Adding and removing pages to/from an address_space is protected by the -inode's i_mutex. +Removing pages from an address_space requires holding the inode's i_rwsem +exclusively, while adding pages to the address_space requires holding the +inode's i_mapping->invalidate_lock exclusively. When data is written to a page, the PG_Dirty flag should be set. It typically remains set until writepages asks for it to be written. This @@ -822,10 +823,10 @@ cache in your filesystem. The following members are defined: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); - int (*write_begin)(struct file *, struct address_space *mapping, + int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, + struct page **pagep, void **fsdata); + int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); diff --git a/block/fops.c b/block/fops.c index 1309861d4c2c4..f34e7315c83cd 100644 --- a/block/fops.c +++ b/block/fops.c @@ -496,18 +496,21 @@ static void blkdev_readahead(struct readahead_control *rac) mpage_readahead(rac, blkdev_get_block); } -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int blkdev_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int blkdev_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = block_write_end(pos, len, copied, folio); folio_unlock(folio); folio_put(folio); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 19a3eb82dc6a6..9cbb0f68a5bb9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -6,6 +6,7 @@ #include <linux/pagevec.h> #include <linux/shmem_fs.h> #include <linux/swap.h> +#include <linux/uio.h> #include <drm/drm_cache.h> @@ -400,12 +401,12 @@ static int shmem_pwrite(struct drm_i915_gem_object *obj, const struct drm_i915_gem_pwrite *arg) { - struct address_space *mapping = obj->base.filp->f_mapping; - const struct address_space_operations *aops = mapping->a_ops; char __user *user_data = u64_to_user_ptr(arg->data_ptr); - u64 remain; - loff_t pos; - unsigned int pg; + struct file *file = obj->base.filp; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t written; + u64 size = arg->size; /* Caller already validated user args */ GEM_BUG_ON(!access_ok(user_data, arg->size)); @@ -428,63 +429,24 @@ shmem_pwrite(struct drm_i915_gem_object *obj, if (obj->mm.madv != I915_MADV_WILLNEED) return -EFAULT; - /* - * Before the pages are instantiated the object is treated as being - * in the CPU domain. The pages will be clflushed as required before - * use, and we can freely write into the pages directly. If userspace - * races pwrite with any other operation; corruption will ensue - - * that is userspace's prerogative! - */ + if (size > MAX_RW_COUNT) + return -EFBIG; - remain = arg->size; - pos = arg->offset; - pg = offset_in_page(pos); + if (!file->f_op->write_iter) + return -EINVAL; - do { - unsigned int len, unwritten; - struct folio *folio; - void *data, *vaddr; - int err; - char __maybe_unused c; - - len = PAGE_SIZE - pg; - if (len > remain) - len = remain; - - /* Prefault the user page to reduce potential recursion */ - err = __get_user(c, user_data); - if (err) - return err; - - err = __get_user(c, user_data + len - 1); - if (err) - return err; - - err = aops->write_begin(obj->base.filp, mapping, pos, len, - &folio, &data); - if (err < 0) - return err; - - vaddr = kmap_local_folio(folio, offset_in_folio(folio, pos)); - pagefault_disable(); - unwritten = __copy_from_user_inatomic(vaddr, user_data, len); - pagefault_enable(); - kunmap_local(vaddr); - - err = aops->write_end(obj->base.filp, mapping, pos, len, - len - unwritten, folio, data); - if (err < 0) - return err; - - /* We don't handle -EFAULT, leave it to the caller to check */ - if (unwritten) - return -ENODEV; - - remain -= len; - user_data += len; - pos += len; - pg = 0; - } while (remain); + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = arg->offset; + iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)user_data, size); + + written = file->f_op->write_iter(&kiocb, &iter); + BUG_ON(written == -EIOCBQUEUED); + + if (written != size) + return -EIO; + + if (written < 0) + return written; return 0; } @@ -637,9 +599,8 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, { struct drm_i915_gem_object *obj; struct file *file; - const struct address_space_operations *aops; - loff_t pos; - int err; + loff_t pos = 0; + ssize_t err; GEM_WARN_ON(IS_DGFX(i915)); obj = i915_gem_object_create_shmem(i915, round_up(size, PAGE_SIZE)); @@ -649,29 +610,15 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU); file = obj->base.filp; - aops = file->f_mapping->a_ops; - pos = 0; - do { - unsigned int len = min_t(typeof(size), size, PAGE_SIZE); - struct folio *folio; - void *fsdata; - - err = aops->write_begin(file, file->f_mapping, pos, len, - &folio, &fsdata); - if (err < 0) - goto fail; + err = kernel_write(file, data, size, &pos); - memcpy_to_folio(folio, offset_in_folio(folio, pos), data, len); + if (err < 0) + goto fail; - err = aops->write_end(file, file->f_mapping, pos, len, len, - folio, fsdata); - if (err < 0) - goto fail; - - size -= len; - data += len; - pos += len; - } while (size); + if (err != size) { + err = -EIO; + goto fail; + } return obj; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 21527189e4307..6830f8bc8d4eb 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -53,13 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, inode->i_size); } -static int adfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int adfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); if (unlikely(ret)) diff --git a/fs/affs/file.c b/fs/affs/file.c index 7a71018e3f675..219ea03539060 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -415,13 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -static int affs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int affs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -430,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int affs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, +static int affs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned int len, unsigned int copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); /* Clear Archived bit on file writes, as AmigaOS would do */ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { @@ -645,7 +647,8 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) return err; } -static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, +static int affs_write_begin_ofs(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -684,9 +687,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return err; } -static int affs_write_end_ofs(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int affs_write_end_ofs(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; diff --git a/fs/attr.c b/fs/attr.c index 9caf63d20d03e..5425c1dbbff92 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -230,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare); * @inode: the inode to be truncated * @offset: the new size to assign to the inode * - * inode_newsize_ok must be called with i_mutex held. + * inode_newsize_ok must be called with i_rwsem held exclusively. * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ @@ -318,7 +318,7 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) * @inode: the inode to be updated * @attr: the new attributes * - * setattr_copy must be called with i_mutex held. + * setattr_copy must be called with i_rwsem held exclusively. * * setattr_copy updates the inode's metadata with that specified * in attr on idmapped mounts. Necessary permission checks to determine @@ -403,13 +403,13 @@ EXPORT_SYMBOL(may_setattr); * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * - * The caller must hold the i_mutex on the affected object. + * The caller must hold the i_rwsem exclusively on the affected object. * * If notify_change discovers a delegation in need of breaking, * it will return -EWOULDBLOCK and return a reference to the inode in * delegated_inode. The caller should then break the delegation and * retry. Because breaking a delegation may take a long time, the - * caller should drop the i_mutex before doing so. + * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -456,7 +456,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - /* Flag setting protected by i_mutex */ + /* Flag setting protected by i_rwsem */ if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 66bacdd49f789..1c54b9b5bd695 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -674,7 +674,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc /* buffered writes: */ -int bch2_write_begin(struct file *file, struct address_space *mapping, +int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -757,7 +757,7 @@ err_unlock: return bch2_err_class(ret); } -int bch2_write_end(struct file *file, struct address_space *mapping, +int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index 3207ebbb4ab43..14de91c276560 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -10,9 +10,9 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t pos, +int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos, unsigned len, struct folio **, void **); -int bch2_write_end(struct file *, struct address_space *, loff_t, +int bch2_write_end(const struct kiocb *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/fs/bfs/file.c b/fs/bfs/file.c index fa66a09e496a8..10dc0151ea550 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -170,9 +170,10 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, inode->i_size); } -static int bfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int bfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/buffer.c b/fs/buffer.c index 8cf4a1dc481eb..ead4dc85debdd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1122,15 +1122,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, { bool blocking = gfpflags_allow_blocking(gfp); - /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_logical_block_size(bdev)-1) || - (size < 512 || size > PAGE_SIZE))) { - printk(KERN_ERR "getblk(): invalid block size %d requested\n", - size); - printk(KERN_ERR "logical block size: %d\n", - bdev_logical_block_size(bdev)); - - dump_stack(); + if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) { + printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n", + size, bdev_logical_block_size(bdev)); return NULL; } @@ -2271,9 +2265,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, } EXPORT_SYMBOL(block_write_begin); -int block_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +int block_write_end(loff_t pos, unsigned len, unsigned copied, + struct folio *folio) { size_t start = pos - folio_pos(folio); @@ -2304,15 +2297,15 @@ int block_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(block_write_end); -int generic_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +int generic_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; - copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * No need to use i_size_read() here, the i_size cannot change under us @@ -2501,7 +2494,8 @@ out: } EXPORT_SYMBOL(generic_cont_expand_simple); -static int cont_expand_zero(struct file *file, struct address_space *mapping, +static int cont_expand_zero(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; @@ -2525,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = aops->write_begin(file, mapping, curpos, len, + err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err = aops->write_end(file, mapping, curpos, len, len, + err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; @@ -2558,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = aops->write_begin(file, mapping, curpos, len, + err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err = aops->write_end(file, mapping, curpos, len, len, + err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; @@ -2578,17 +2572,16 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata, - get_block_t *get_block, loff_t *bytes) +int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; unsigned int blocksize = i_blocksize(inode); unsigned int zerofrom; int err; - err = cont_expand_zero(file, mapping, pos, bytes); + err = cont_expand_zero(iocb, mapping, pos, bytes); if (err) return err; @@ -2610,7 +2603,7 @@ EXPORT_SYMBOL(cont_write_begin); * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * - * We are not allowed to take the i_mutex here so we have to play games to + * We are not allowed to take the i_rwsem here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index aecfc5c37b49a..91dfd02318772 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -388,10 +388,10 @@ try_again: } else { struct renamedata rd = { .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = d_inode(dir), + .old_parent = dir, .old_dentry = rep, .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = d_inode(cache->graveyard), + .new_parent = cache->graveyard, .new_dentry = grave, }; trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 60a621b00c656..02468c848cce2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1864,10 +1864,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. */ -static int ceph_write_begin(struct file *file, struct address_space *mapping, +static int ceph_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int r; @@ -1885,10 +1887,12 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, * we don't do anything in here that simple_write_end doesn't do * except adjust dirty page accounting */ -static int ceph_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, +static int ceph_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; diff --git a/fs/dcache.c b/fs/dcache.c index 2adac023ba23f..60046ae23d514 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2797,10 +2797,10 @@ static void copy_name(struct dentry *dentry, struct dentry *target) * @target: new dentry * @exchange: exchange the two dentries * - * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller must hold - * rename_lock, the i_mutex of the source and target directories, - * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). + * Update the dcache to reflect the move of a file name. Negative dcache + * entries should not be moved in this way. Caller must hold rename_lock, the + * i_rwsem of the source and target directories (exclusively), and the sb-> + * s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) @@ -2946,7 +2946,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, and rename_lock + * dentry->d_parent->d_inode->i_rwsem, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... diff --git a/fs/direct-io.c b/fs/direct-io.c index bbd05f1a21453..1694ee9a93820 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1083,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio) * The locking rules are governed by the flags parameter: * - if the flags value contains DIO_LOCKING we use a fancy locking * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is + * For writes this function is called under i_rwsem and returns with + * i_rwsem held, for reads, i_rwsem is not held on entry, but it is * taken and dropped again before returning. * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize @@ -1094,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio) * counter before starting direct I/O, and decrement it once we are done. * Truncate can wait for it to reach zero to provide exclusion. It is * expected that filesystem provide exclusion between new direct I/O - * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, + * and truncates. For DIO_LOCKING filesystems this is done by i_rwsem, * but other filesystems need to take care of this on their own. * * NOTE: if you pass "sdio" to anything by pointer make sure that function @@ -1279,7 +1279,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose + * we can let i_rwsem go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. */ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 493d7f1949561..bd317d943d62e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -635,10 +635,10 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } rd.old_mnt_idmap = &nop_mnt_idmap; - rd.old_dir = d_inode(lower_old_dir_dentry); + rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; rd.new_mnt_idmap = &nop_mnt_idmap; - rd.new_dir = d_inode(lower_new_dir_dentry); + rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); if (rc) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 45f9ca4465da8..eab1beb846d38 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -20,6 +20,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs_stack.h> +#include <linux/sysfs.h> #include <linux/slab.h> #include <linux/magic.h> #include "ecryptfs_kernel.h" @@ -764,7 +765,7 @@ static struct kobject *ecryptfs_kobj; static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { - return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); + return sysfs_emit(buff, "%d\n", ECRYPTFS_VERSIONING_MASK); } static struct kobj_attribute version_attr = __ATTR_RO(version); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 60f0ac8744b50..2c2b12fedeaec 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -228,7 +228,7 @@ out: /** * ecryptfs_write_begin - * @file: The eCryptfs file + * @iocb: I/O control block for the eCryptfs file * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write @@ -239,7 +239,7 @@ out: * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_write_begin(struct file *file, +static int ecryptfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file, * Note, this will increase i_size. */ if (index != 0) { if (prev_page_end_size > i_size_read(mapping->host)) { - rc = ecryptfs_truncate(file->f_path.dentry, + rc = ecryptfs_truncate(iocb->ki_filp->f_path.dentry, prev_page_end_size); if (rc) { printk(KERN_ERR "%s: Error on attempt to " @@ -429,7 +429,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) /** * ecryptfs_write_end - * @file: The eCryptfs file object + * @iocb: I/O control block for the eCryptfs file * @mapping: The eCryptfs object * @pos: The file position * @len: The length of the data (unused) @@ -437,7 +437,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) * @folio: The eCryptfs folio * @fsdata: The fsdata (unused) */ -static int ecryptfs_write_end(struct file *file, +static int ecryptfs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0fbf5dfedb24e..b22d6f819f782 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,6 +218,7 @@ struct eventpoll { /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; + u8 loop_check_depth; /* * usage count, used together with epitem->dying to @@ -2140,23 +2141,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } /** - * ep_loop_check_proc - verify that adding an epoll file inside another - * epoll structure does not violate the constraints, in - * terms of closed loops, or too deep chains (which can - * result in excessive stack usage). + * ep_loop_check_proc - verify that adding an epoll file @ep inside another + * epoll file does not create closed loops, and + * determine the depth of the subtree starting at @ep * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Return: %zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or %-1 otherwise. + * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { - int error = 0; + int result = 0; struct rb_node *rbp; struct epitem *epi; + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { @@ -2164,13 +2166,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->gen == loop_check_gen) - continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) - error = -1; + result = INT_MAX; else - error = ep_loop_check_proc(ep_tovisit, depth + 1); - if (error != 0) + result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); + if (result > EP_MAX_NESTS) break; } else { /* @@ -2184,9 +2184,25 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) list_file(epi->ffd.file); } } + ep->loop_check_depth = result; mutex_unlock(&ep->mtx); - return error; + return result; +} + +/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */ +static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) +{ + int result = 0; + struct epitem *epi; + + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + hlist_for_each_entry_rcu(epi, &ep->refs, fllink) + result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1); + ep->gen = loop_check_gen; + ep->loop_check_depth = result; + return result; } /** @@ -2202,8 +2218,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { + int depth, upwards_depth; + inserting_into = ep; - return ep_loop_check_proc(to, 0); + /* + * Check how deep down we can get from @to, and whether it is possible + * to loop up to @ep. + */ + depth = ep_loop_check_proc(to, 0); + if (depth > EP_MAX_NESTS) + return -1; + /* Check how far up we can go from @ep. */ + rcu_read_lock(); + upwards_depth = ep_get_upwards_depth_proc(ep, 0); + rcu_read_unlock(); + + return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; } static void clear_tfile_check_list(void) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 841a5b18e3dfd..70f53edd0a104 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -532,11 +532,10 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return blkdev_issue_flush(inode->i_sb->s_bdev); } -static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) +static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size) { int err; loff_t pos; - struct inode *inode = file_inode(file); struct exfat_inode_info *ei = EXFAT_I(inode); struct address_space *mapping = inode->i_mapping; const struct address_space_operations *ops = mapping->a_ops; @@ -551,14 +550,14 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (pos + len > new_valid_size) len = new_valid_size - pos; - err = ops->write_begin(file, mapping, pos, len, &folio, NULL); + err = ops->write_begin(NULL, mapping, pos, len, &folio, NULL); if (err) goto out; off = offset_in_folio(folio, pos); folio_zero_new_buffers(folio, off, off + len); - err = ops->write_end(file, mapping, pos, len, len, folio, NULL); + err = ops->write_end(NULL, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; @@ -604,7 +603,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) } if (pos > valid_size) { - ret = exfat_extend_valid_size(file, pos); + ret = exfat_extend_valid_size(inode, pos); if (ret < 0 && ret != -ENOSPC) { exfat_err(inode->i_sb, "write: fail to zero from %llu to %llu(%zd)", @@ -665,7 +664,7 @@ static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf) start + vma->vm_end - vma->vm_start); if (ei->valid_size < end) { - err = exfat_extend_valid_size(file, end); + err = exfat_extend_valid_size(inode, end); if (err < 0) { inode_unlock(inode); return vmf_fs_error(err); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index b22c02d6000f7..c10844e1e16c3 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -446,9 +446,10 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) } } -static int exfat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, - struct folio **foliop, void **fsdata) +static int exfat_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned int len, + struct folio **foliop, void **fsdata) { int ret; @@ -463,15 +464,16 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int exfat_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct folio *folio, void *fsdata) +static int exfat_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) exfat_write_failed(mapping, pos+len); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 402fecf90a44b..b07b3b369710c 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 30f8201c155f4..d35ca26eee3ce 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -915,7 +915,7 @@ static void ext2_readahead(struct readahead_control *rac) } static int -ext2_write_begin(struct file *file, struct address_space *mapping, +ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; @@ -926,13 +926,14 @@ ext2_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int ext2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int ext2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 21df81347147c..274b41a476c8f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -977,7 +977,8 @@ const struct file_operations ext4_file_operations = { .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_DIO_PARALLEL_WRITE, + FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index be9a4cba35fd5..5c7024051f1ed 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1252,7 +1252,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ -static int ext4_write_begin(struct file *file, struct address_space *mapping, +static int ext4_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -1263,7 +1264,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; - fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -1287,16 +1287,14 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * __filemap_get_folio() can take a long time if the + * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: - fgp |= fgf_set_order(len); - folio = __filemap_get_folio(mapping, index, fgp, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -1400,12 +1398,12 @@ static int write_end_fn(handle_t *handle, struct inode *inode, /* * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). + * `iocb` can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ -static int ext4_write_end(struct file *file, +static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -1424,7 +1422,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1510,7 +1508,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, } while (bh != head); } -static int ext4_journalled_write_end(struct file *file, +static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -3036,7 +3034,8 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -static int ext4_da_write_begin(struct file *file, struct address_space *mapping, +static int ext4_da_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -3044,7 +3043,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; - fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -3054,7 +3052,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; - return ext4_write_begin(file, mapping, pos, + return ext4_write_begin(iocb, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; @@ -3070,9 +3068,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - fgp |= fgf_set_order(len); - folio = __filemap_get_folio(mapping, index, fgp, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -3144,8 +3140,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ - copied = block_write_end(NULL, mapping, pos, len, copied, - folio, NULL); + copied = block_write_end(pos, len, copied, folio); new_i_size = pos + copied; /* @@ -3196,7 +3191,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, return copied; } -static int ext4_da_write_end(struct file *file, +static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -3205,7 +3200,7 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) - return ext4_write_end(file, mapping, pos, + return ext4_write_end(iocb, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 31e8928426259..711ad80b38d05 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3519,8 +3519,10 @@ reserve_block: return 0; } -static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int f2fs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3656,7 +3658,7 @@ fail: return err; } -static int f2fs_write_end(struct file *file, +static int f2fs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3852bb66358cc..9648ed0978168 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -219,13 +219,14 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) } } -static int fat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int fat_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int err; - err = cont_write_begin(file, mapping, pos, len, + err = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) @@ -233,13 +234,14 @@ static int fat_write_begin(struct file *file, struct address_space *mapping, return err; } -static int fat_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int fat_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { diff --git a/fs/file.c b/fs/file.c index b6db031545e65..6d2275c3be9c6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -197,6 +197,21 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) return ERR_PTR(-EMFILE); } + /* + * Check if the allocation size would exceed INT_MAX. kvmalloc_array() + * and kvmalloc() will warn if the allocation size is greater than + * INT_MAX, as filp_cachep objects are not __GFP_NOWARN. + * + * This can happen when sysctl_nr_open is set to a very high value and + * a process tries to use a file descriptor near that limit. For example, + * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what + * systemd typically sets it to - then trying to use a file descriptor + * close to that value will require allocating a file descriptor table + * that exceeds 8GB in size. + */ + if (unlikely(nr > INT_MAX / sizeof(struct file *))) + return ERR_PTR(-EMFILE); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 47006d0753f1c..2ddfb3bb64836 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2212,10 +2212,13 @@ out: * It's worthy to make sure that space is reserved on disk for the write, * but how to implement it without killing performance need more thinking. */ -static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int fuse_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; + struct file *file = iocb->ki_filp; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); struct folio *folio; loff_t fsize; @@ -2255,9 +2258,10 @@ error: return err; } -static int fuse_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int fuse_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index a0c7cb0f79fcc..c3fd3172fdd60 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -201,7 +201,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; -int hfs_write_begin(struct file *file, struct address_space *mapping, +int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a81ce7a740b91..096f338134f93 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -44,12 +44,12 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } } -int hfs_write_begin(struct file *file, struct address_space *mapping, +int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 2f089bff0095c..3d5c65aef3b23 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -473,8 +473,10 @@ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata); +int hfsplus_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata); struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode); void hfsplus_delete_inode(struct inode *inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f331e95742178..97d75bb2c388e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -38,12 +38,14 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) } } -int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +int hfsplus_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 1f512f8ea757d..2ac60d5d7314a 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -445,7 +445,8 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) return ret; } -static int hostfs_write_begin(struct file *file, struct address_space *mapping, +static int hostfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -458,7 +459,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping, return 0; } -static int hostfs_write_end(struct file *file, struct address_space *mapping, +static int hostfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { @@ -468,7 +470,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, int err; buffer = kmap_local_folio(folio, from); - err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied); + err = write_file(FILE_HOSTFS_I(iocb->ki_filp)->fd, &pos, buffer, copied); kunmap_local(buffer); if (!folio_test_uptodate(folio) && err == folio_size(folio)) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 449a3fc1b8d90..7b95a3d2e2a6d 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -188,13 +188,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) hpfs_unlock(inode->i_sb); } -static int hpfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int hpfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hpfs_get_block, &hpfs_i(mapping->host)->mmu_private); if (unlikely(ret)) @@ -203,13 +204,14 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int hpfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int hpfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) hpfs_write_failed(mapping, pos + len); if (!(err < 0)) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b7994186fc665..9ddd67da0eeb5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -311,7 +311,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval; } -static int hugetlbfs_write_begin(struct file *file, +static int hugetlbfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -319,9 +319,10 @@ static int hugetlbfs_write_begin(struct file *file, return -EINVAL; } -static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int hugetlbfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; diff --git a/fs/inode.c b/fs/inode.c index 99318b157a9a1..01ebdc40021e2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -865,12 +865,12 @@ static void dispose_list(struct list_head *head) */ void evict_inodes(struct super_block *sb) { - struct inode *inode, *next; + struct inode *inode; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -1158,9 +1158,8 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode) /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* - * ensure nobody is actually holding i_mutex + * ensure nobody is actually holding i_rwsem */ - // mutex_destroy(&inode->i_mutex); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); @@ -2615,7 +2614,7 @@ EXPORT_SYMBOL(inode_dio_finished); * proceed with a truncate or equivalent operation. * * Must be called under a lock that serializes taking new references - * to i_dio_count, usually by inode->i_mutex. + * to i_dio_count, usually by inode->i_rwsem. */ void inode_dio_wait(struct inode *inode) { @@ -2633,7 +2632,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * - * Note: the caller should be holding i_mutex, or else be sure that + * Note: the caller should be holding i_rwsem exclusively, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify @@ -2641,7 +2640,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible); * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * - * In the long run, i_mutex is overkill, and we should probably look + * In the long run, i_rwsem is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fb4519158f3a7..8f29be98a46ee 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -926,8 +926,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; - bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, folio, NULL); + bh_written = block_write_end(pos, len, copied, folio); WARN_ON_ONCE(bh_written != copied && bh_written != 0); return bh_written == copied; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 13c18ccc13b0a..adec3af9bf8d1 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -21,12 +21,14 @@ #include <linux/jffs2.h> #include "nodelist.h" -static int jffs2_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata); -static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata); +static int jffs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata); +static int jffs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata); static int jffs2_read_folio(struct file *filp, struct folio *folio); int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) @@ -121,9 +123,10 @@ static int jffs2_read_folio(struct file *file, struct folio *folio) return ret; } -static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int jffs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct folio *folio; struct inode *inode = mapping->host; @@ -235,9 +238,10 @@ out_err: return ret; } -static int jffs2_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int jffs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 60fc92dee24d2..083e7fa54709d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -290,9 +290,10 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) } } -static int jfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int jfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; @@ -303,13 +304,14 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int jfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int jfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) jfs_write_failed(mapping, pos + len); return ret; diff --git a/fs/libfs.c b/fs/libfs.c index 89e47af931f1b..67bd8eea4af14 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -921,7 +921,7 @@ static int simple_read_folio(struct file *file, struct folio *folio) return 0; } -int simple_write_begin(struct file *file, struct address_space *mapping, +int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -946,7 +946,7 @@ EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes - * @file: See .write_end of address_space_operations + * @iocb: kernel I/O control block * @mapping: " * @pos: " * @len: " @@ -957,7 +957,8 @@ EXPORT_SYMBOL(simple_write_begin); * simple_write_end does the minimum needed for updating a folio after * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for - * FSes that don't need any other processing. i_mutex is assumed to be held. + * FSes that don't need any other processing. i_rwsem is assumed to be held + * exclusively. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode @@ -966,9 +967,10 @@ EXPORT_SYMBOL(simple_write_begin); * * Use *ONLY* with simple_read_folio() */ -static int simple_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int simple_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; @@ -984,7 +986,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping, } /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. + * cannot change under us because we hold the i_rwsem. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); @@ -1594,13 +1596,17 @@ EXPORT_SYMBOL(generic_file_fsync); int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; - u64 last_fs_page = - last_fs_block >> (PAGE_SHIFT - blocksize_bits); + u64 last_fs_page, max_bytes; + + if (check_shl_overflow(num_blocks, blocksize_bits, &max_bytes)) + return -EFBIG; + + last_fs_page = (max_bytes >> PAGE_SHIFT) - 1; if (unlikely(num_blocks == 0)) return 0; - if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) + if (blocksize_bits < 9) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || diff --git a/fs/locks.c b/fs/locks.c index 1619cddfa7a4d..559f02aa41722 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -712,7 +712,7 @@ static void __locks_wake_up_blocks(struct file_lock_core *blocker) fl->fl_lmops && fl->fl_lmops->lm_notify) fl->fl_lmops->lm_notify(fl); else - locks_wake_up(fl); + locks_wake_up_waiter(waiter); /* * The setting of flc_blocker to NULL marks the "done" @@ -1794,7 +1794,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr /* * In the delegation case we need mutual exclusion with - * a number of operations that take the i_mutex. We trylock + * a number of operations that take the i_rwsem. We trylock * because delegations are an optional optimization, and if * there's some chance of a conflict--we'd rather not * bother, maybe that's a sign this just isn't a good file to diff --git a/fs/minix/dir.c b/fs/minix/dir.c index dd2a425b41f04..19052fc47e9ea 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,7 +45,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f007e389d5d29..df9d11479caf1 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -442,9 +442,10 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) } } -static int minix_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int minix_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/namei.c b/fs/namei.c index c26a7ee421849..ae95bea02bbc0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1469,7 +1469,7 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { - /* Allow the filesystem to manage the transit without i_mutex + /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); @@ -2946,7 +2946,7 @@ EXPORT_SYMBOL(try_lookup_noperm); * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { @@ -2972,7 +2972,7 @@ EXPORT_SYMBOL(lookup_noperm); * * This can be used for in-kernel filesystem clients such as file servers. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base) @@ -4551,13 +4551,13 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * - * The caller must hold dir->i_mutex. + * The caller must hold dir->i_rwsem exclusively. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop - * dir->i_mutex before doing so. + * dir->i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -4616,7 +4616,7 @@ EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its - * directory's i_mutex. Truncate can take a long time if there is a lot of + * directory's i_rwsem. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ @@ -4794,13 +4794,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * - * The caller must hold dir->i_mutex + * The caller must hold dir->i_rwsem exclusively. * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the - * caller should drop the i_mutex before doing so. + * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -4996,7 +4996,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). - * And that - after we got ->i_mutex on parents (until then we don't know + * And that - after we got ->i_rwsem on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we @@ -5008,15 +5008,16 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when - * we are removing the target. Solution: we will have to grab ->i_mutex + * we are removing the target. Solution: we will have to grab ->i_rwsem * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_mutex on parents, which works but leads to some truly excessive + * ->i_rwsem on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; - struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; + struct inode *old_dir = d_inode(rd->old_parent); + struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; @@ -5275,10 +5276,10 @@ retry_deleg: if (error) goto exit5; - rd.old_dir = old_path.dentry->d_inode; + rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); - rd.new_dir = new_path.dentry->d_inode; + rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; diff --git a/fs/namespace.c b/fs/namespace.c index c549bd39c210a..87c0061302051 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2022,7 +2022,7 @@ out: * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * - * The caller may hold dentry->d_inode->i_mutex. + * The caller may hold dentry->d_inode->i_rwsem. */ void __detach_mounts(struct dentry *dentry) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 033feeab8c346..2bd557ca1af95 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -342,12 +342,14 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_write_begin(struct file *file, struct address_space *mapping, +static int nfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { fgf_t fgp = FGP_WRITEBEGIN; struct folio *folio; + struct file *file = iocb->ki_filp; int once_thru = 0; int ret; @@ -377,10 +379,12 @@ start: return ret; } -static int nfs_write_end(struct file *file, struct address_space *mapping, +static int nfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = nfs_file_open_context(file); unsigned offset = offset_in_folio(folio, pos); int status; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ee78b6fb17098..98ab55ba3ced7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1867,7 +1867,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct svc_fh *tfhp, char *tname, int tlen) { struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; - struct inode *fdir, *tdir; int type = S_IFDIR; __be32 err; int host_err; @@ -1883,10 +1882,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; fdentry = ffhp->fh_dentry; - fdir = d_inode(fdentry); tdentry = tfhp->fh_dentry; - tdir = d_inode(tdentry); err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) @@ -1947,10 +1944,10 @@ retry: } else { struct renamedata rd = { .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = fdir, + .old_parent = fdentry, .old_dentry = odentry, .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = tdir, + .new_parent = tdentry, .new_dentry = ndentry, }; int retries; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 9b7f8e9655a27..6ca3d74be1e16 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio, int err; nr_dirty = nilfs_page_count_clean_buffers(folio, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); + copied = block_write_end(pos, len, len, folio); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 5cf7328d53605..87ddde159f0c9 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -218,7 +218,8 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) } } -static int nilfs_write_begin(struct file *file, struct address_space *mapping, +static int nilfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -237,7 +238,8 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, return err; } -static int nilfs_write_end(struct file *file, struct address_space *mapping, +static int nilfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { @@ -248,7 +250,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, nr_dirty = nilfs_page_count_clean_buffers(folio, start, start + copied); - copied = generic_write_end(file, mapping, pos, len, copied, folio, + copied = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 22aecf6e23445..a9c61d0492cbd 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -560,8 +560,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, if (unlikely(err)) goto failed_folio; - block_write_end(NULL, inode->i_mapping, pos, blocksize, - blocksize, folio, NULL); + block_write_end(pos, blocksize, blocksize, folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 2e321b84a1ed9..c12ae1fb8b370 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -162,13 +162,13 @@ static int ntfs_extend_initialized_size(struct file *file, if (pos + len > new_valid) len = new_valid - pos; - err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL); + err = ntfs_write_begin(NULL, mapping, pos, len, &folio, NULL); if (err) goto out; folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom); - err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL); + err = ntfs_write_end(NULL, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 4f9020df8912d..37cbbee7fa580 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -920,7 +920,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, bh_result, create, GET_BLOCK_WRITE_BEGIN); } -int ntfs_write_begin(struct file *file, struct address_space *mapping, +int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, u32 len, struct folio **foliop, void **fsdata) { int err; @@ -969,7 +969,8 @@ out: /* * ntfs_write_end - Address_space_operations::write_end. */ -int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, +int ntfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; @@ -1001,7 +1002,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, folio_unlock(folio); folio_put(folio); } else { - err = generic_write_end(file, mapping, pos, len, copied, folio, + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); } diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 9a25fec25f010..1296e6fcc779c 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -708,10 +708,12 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, int ntfs_set_size(struct inode *inode, u64 new_size); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); -int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct folio **foliop, void **fsdata); -int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct folio *folio, void *fsdata); +int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, u32 len, struct folio **foliop, + void **fsdata); +int ntfs_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, u32 len, u32 copied, struct folio *folio, + void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int inode_read_data(struct inode *inode, void *data, size_t bytes); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 40b6bce12951e..2203438738f68 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1856,7 +1856,8 @@ out: return ret; } -static int ocfs2_write_begin(struct file *file, struct address_space *mapping, +static int ocfs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -2047,7 +2048,8 @@ out: return copied; } -static int ocfs2_write_end(struct file *file, struct address_space *mapping, +static int ocfs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 98358d405b6ab..8d70f816b0c9d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -310,9 +310,10 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) } } -static int omfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int omfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/open.c b/fs/open.c index 7828234a7caa4..b29d1e077164b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1204,14 +1204,11 @@ struct file *kernel_file_open(const struct path *path, int flags, if (IS_ERR(f)) return f; - f->f_path = *path; - error = do_dentry_open(f, NULL); + error = vfs_open(path, f); if (error) { fput(f); return ERR_PTR(error); } - - fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 08a6f372a352f..a7ab637767359 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -285,9 +285,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) return ret; } -static int orangefs_write_begin(struct file *file, - struct address_space *mapping, loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int orangefs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { struct orangefs_write_range *wr; struct folio *folio; @@ -340,9 +341,10 @@ okay: return 0; } -static int orangefs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int orangefs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; @@ -372,7 +374,7 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - mark_inode_dirty_sync(file_inode(file)); + mark_inode_dirty_sync(file_inode(iocb->ki_filp)); return copied; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d7310fcf38881..8a3c0d18ec2ea 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -563,7 +563,7 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, if (IS_ERR(index)) { err = PTR_ERR(index); } else { - err = ovl_do_rename(ofs, dir, temp, dir, index, 0); + err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0); dput(index); } out: @@ -762,7 +762,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; - struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct inode *wdir = d_inode(c->workdir); struct path path = { .mnt = ovl_upper_mnt(ofs) }; struct dentry *temp, *upper, *trap; struct ovl_cu_creds cc; @@ -829,7 +829,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (IS_ERR(upper)) goto cleanup; - err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0); + err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0); dput(upper); if (err) goto cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index fe493f3ed6b6f..4fc221ea6480c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -107,7 +107,7 @@ out: } /* Caller must hold i_mutex on both workdir and dir */ -int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry) { struct inode *wdir = ofs->workdir->d_inode; @@ -123,7 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags); + err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags); if (err) goto kill_whiteout; if (flags) @@ -384,7 +384,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; @@ -491,14 +491,14 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, + err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; ovl_cleanup(ofs, wdir, upper); } else { - err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0); + err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0); if (err) goto out_cleanup; } @@ -774,7 +774,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, goto out_dput_upper; } - err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper); + err = ovl_cleanup_and_whiteout(ofs, upperdir, upper); if (err) goto out_d_drop; @@ -1246,8 +1246,8 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (err) goto out_dput; - err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry, - new_upperdir->d_inode, newdentry, flags); + err = ovl_do_rename(ofs, old_upperdir, olddentry, + new_upperdir, newdentry, flags); if (err) goto out_dput; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 497323128e5fc..42228d10f6b9b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -355,19 +355,19 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } -static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, - struct dentry *olddentry, struct inode *newdir, +static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, + struct dentry *olddentry, struct dentry *newdir, struct dentry *newdentry, unsigned int flags) { int err; struct renamedata rd = { .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), - .old_dir = olddir, - .old_dentry = olddentry, + .old_parent = olddir, + .old_dentry = olddentry, .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), - .new_dir = newdir, - .new_dentry = newdentry, - .flags = flags, + .new_parent = newdir, + .new_dentry = newdentry, + .flags = flags, }; pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); @@ -828,7 +828,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry); struct ovl_cattr { dev_t rdev; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 474c80d210d18..68cca52ae2acb 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -1235,7 +1235,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ - err = ovl_cleanup_and_whiteout(ofs, dir, index); + err = ovl_cleanup_and_whiteout(ofs, indexdir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, dir, index); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index efbf0b2915516..4e7e0711a1ff7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -580,7 +580,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT); + err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT); if (err) { if (err == -EINVAL) err = 0; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index dcccb4b4a66c7..2b4754c645eea 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1115,7 +1115,7 @@ static void ovl_cleanup_index(struct dentry *dentry) } else if (ovl_index_all(dentry->d_sb)) { /* Whiteout orphan index to block future open by handle */ err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb), - dir, index); + indexdir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, dir, index); diff --git a/fs/pipe.c b/fs/pipe.c index 45077c37bad15..731622d0738d4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -963,6 +963,11 @@ int create_pipe_files(struct file **res, int flags) res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); + + /* pipe groks IOCB_NOWAIT */ + res[0]->f_mode |= FMODE_NOWAIT; + res[1]->f_mode |= FMODE_NOWAIT; + /* * Disable permission and pre-content events, but enable legacy * inotify events for legacy users. @@ -997,9 +1002,6 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags) audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; - /* pipe groks IOCB_NOWAIT */ - files[0]->f_mode |= FMODE_NOWAIT; - files[1]->f_mode |= FMODE_NOWAIT; return 0; err_fdr: diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 37aa778d1af7b..9eeccff49b2ab 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -352,18 +352,9 @@ static int proc_fd_getattr(struct mnt_idmap *idmap, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int rv = 0; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - - /* If it's a directory, put the number of open fds there */ - if (S_ISDIR(inode->i_mode)) { - rv = proc_readfd_count(inode, &stat->size); - if (rv < 0) - return rv; - } - - return rv; + return proc_readfd_count(inode, &stat->size); } const struct inode_operations proc_fd_inode_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index 0ef70e128c4af..81a9474dc3963 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -237,7 +237,7 @@ EXPORT_SYMBOL(generic_llseek_cookie); * @offset: file offset to seek to * @whence: type of seek * - * This is a generic implemenation of ->llseek useable for all normal local + * This is a generic implementation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ diff --git a/fs/select.c b/fs/select.c index 9fb650d03d52e..082cf60c7e235 100644 --- a/fs/select.c +++ b/fs/select.c @@ -192,7 +192,7 @@ static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *k * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); - pwq->triggered = 1; + WRITE_ONCE(pwq->triggered, 1); /* * Perform the default wake up operation using a dummy @@ -237,7 +237,7 @@ static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, int rc = -EINTR; set_current_state(state); - if (!pwq->triggered) + if (!READ_ONCE(pwq->triggered)) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index d3437f6644e33..34871a7f3e4b8 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -765,10 +765,10 @@ retry: } rd.old_mnt_idmap = mnt_idmap(old_path->mnt), - rd.old_dir = d_inode(old_parent), + rd.old_parent = old_parent, rd.old_dentry = old_child, rd.new_mnt_idmap = mnt_idmap(new_path.mnt), - rd.new_dir = new_path.dentry->d_inode, + rd.new_parent = new_path.dentry, rd.new_dentry = new_dentry, rd.flags = flags, rd.delegated_inode = NULL, diff --git a/fs/stack.c b/fs/stack.c index f189201199443..d8c782e064e3e 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -3,7 +3,7 @@ #include <linux/fs.h> #include <linux/fs_stack.h> -/* does _NOT_ require i_mutex to be held. +/* does _NOT_ require i_rwsem to be held. * * This function cannot be inlined since i_size_{read,write} is rather * heavy-weight on 32-bit systems @@ -41,7 +41,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for * fsstack_copy_inode_size() to hold some lock around * i_size_write(), otherwise i_size_read() may spin forever (see - * include/linux/fs.h). We don't necessarily hold i_mutex when this + * include/linux/fs.h). We don't necessarily hold i_rwsem when this * is called, so take i_lock for that case. * * And if on 32-bit, continue our effort to keep the two halves of diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index bf311c38d9a89..cee8bec1ea26d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -404,7 +404,8 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio, * there is a plenty of flash space and the budget will be acquired quickly, * without forcing write-back. The slow path does not make this assumption. */ -static int ubifs_write_begin(struct file *file, struct address_space *mapping, +static int ubifs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -514,8 +515,9 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio, } } -static int ubifs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, +static int ubifs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4386dd845e400..356b75676fa93 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -244,10 +244,12 @@ static void udf_readahead(struct readahead_control *rac) mpage_readahead(rac, udf_get_block); } -static int udf_write_begin(struct file *file, struct address_space *mapping, +static int udf_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { + struct file *file = iocb->ki_filp; struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; @@ -271,15 +273,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, return 0; } -static int udf_write_end(struct file *file, struct address_space *mapping, +static int udf_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(iocb->ki_filp); loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) - return generic_write_end(file, mapping, pos, len, copied, folio, + return generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); last_pos = pos + copied; if (last_pos > inode->i_size) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 88d0062cfdb9e..0388a1bae326b 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -48,7 +48,7 @@ static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7dc38fdef2eab..8361c00e8fa6f 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -474,9 +474,10 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) } } -static int ufs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int ufs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; @@ -487,13 +488,14 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int ufs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int ufs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) ufs_write_failed(mapping, pos + len); return ret; diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index b492794f8e9a7..af01e3beaa42b 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -300,12 +300,13 @@ static int vboxsf_writepages(struct address_space *mapping, return error; } -static int vboxsf_write_end(struct file *file, struct address_space *mapping, +static int vboxsf_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; - struct vboxsf_handle *sf_handle = file->private_data; + struct vboxsf_handle *sf_handle = iocb->ki_filp->private_data; size_t from = offset_in_folio(folio, pos); u32 nwritten = len; u8 *buf; diff --git a/fs/xattr.c b/fs/xattr.c index 600ae97969cf2..8851a5ef34f5a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -215,7 +215,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * * returns the result of the internal setxattr or setsecurity operations. * - * This function requires the caller to lock the inode's i_mutex before it + * This function requires the caller to lock the inode's i_rwsem before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 0029ff880e274..b16b88bfbc3e7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -262,14 +262,12 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block); int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); -int block_write_end(struct file *, struct address_space *, - loff_t, unsigned len, unsigned copied, - struct folio *, void *); -int generic_write_end(struct file *, struct address_space *, +int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); +int generic_write_end(const struct kiocb *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); -int cont_write_begin(struct file *, struct address_space *, loff_t, +int cont_write_begin(const struct kiocb *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 25c4a5afbd443..cfb0dd1ea49c7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -230,7 +230,7 @@ struct handle_to_path_ctx { * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX+1 sized * buffer. get_name() should return %0 on success, a negative error code - * or error. @get_name will be called without @parent->i_mutex held. + * or error. @get_name will be called without @parent->i_rwsem held. * * get_parent: * @get_parent should find the parent directory for the given @child which @@ -247,7 +247,7 @@ struct handle_to_path_ctx { * @commit_metadata should commit metadata changes to stable storage. * * Locking rules: - * get_parent is called with child->d_inode->i_mutex down + * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) */ diff --git a/include/linux/filelock.h b/include/linux/filelock.h index c412ded9171ed..c2ce8ba05d068 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -175,9 +175,14 @@ static inline bool lock_is_write(struct file_lock *fl) return fl->c.flc_type == F_WRLCK; } +static inline void locks_wake_up_waiter(struct file_lock_core *flc) +{ + wake_up(&flc->flc_wait); +} + static inline void locks_wake_up(struct file_lock *fl) { - wake_up(&fl->c.flc_wait); + locks_wake_up_waiter(&fl->c); } static inline bool locks_can_async_lock(const struct file_operations *fops) diff --git a/include/linux/fs.h b/include/linux/fs.h index ea5b14fe424d9..1b5212de57021 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -446,10 +446,10 @@ struct address_space_operations { void (*readahead)(struct readahead_control *); - int (*write_begin)(struct file *, struct address_space *mapping, + int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, + int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); @@ -839,7 +839,7 @@ static inline void inode_fake_hash(struct inode *inode) } /* - * inode->i_mutex nesting subclasses for the lock validator: + * inode->i_rwsem nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent @@ -991,7 +991,7 @@ static inline loff_t i_size_read(const struct inode *inode) /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it - * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount + * (normally i_rwsem), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) @@ -1924,7 +1924,7 @@ static inline void sb_end_intwrite(struct super_block *sb) * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write - * -> i_mutex (write path, truncate, directory ops, ...) + * -> i_rwsem (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) @@ -2007,20 +2007,20 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, /** * struct renamedata - contains all information required for renaming * @old_mnt_idmap: idmap of the old mount the inode was found from - * @old_dir: parent of source + * @old_parent: parent of source * @old_dentry: source * @new_mnt_idmap: idmap of the new mount the inode was found from - * @new_dir: parent of destination + * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *old_mnt_idmap; - struct inode *old_dir; + struct dentry *old_parent; struct dentry *old_dentry; struct mnt_idmap *new_mnt_idmap; - struct inode *new_dir; + struct dentry *new_parent; struct dentry *new_dentry; struct inode **delegated_inode; unsigned int flags; @@ -3605,9 +3605,10 @@ extern void locked_recursive_removal(struct dentry *, extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); -extern int simple_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata); +extern int simple_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index a19e4bd32e4d3..7773eb870039c 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -200,7 +200,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, */ #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) -#define infofc(p, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) +#define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h index 2b1f74b240707..0cc2fa283305b 100644 --- a/include/linux/fs_stack.h +++ b/include/linux/fs_stack.h @@ -3,7 +3,7 @@ #define _LINUX_FS_STACK_H /* This file defines generic functions used primarily by stackable - * filesystems; none of these functions require i_mutex to be held. + * filesystems; none of these functions require i_rwsem to be held. */ #include <linux/fs.h> diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f43f075852c06..185bd8196503d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -442,7 +442,6 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq); void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error); -void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e63fbfbd5b0f3..ce2bcdcadb733 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -751,6 +751,33 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); /** + * write_begin_get_folio - Get folio for write_begin with flags. + * @iocb: The kiocb passed from write_begin (may be NULL). + * @mapping: The address space to search. + * @index: The page cache index. + * @len: Length of data being written. + * + * This is a helper for filesystem write_begin() implementations. + * It wraps __filemap_get_folio(), setting appropriate flags in + * the write begin context. + * + * Return: A folio or an ERR_PTR. + */ +static inline struct folio *write_begin_get_folio(const struct kiocb *iocb, + struct address_space *mapping, pgoff_t index, size_t len) +{ + fgf_t fgp_flags = FGP_WRITEBEGIN; + + fgp_flags |= fgf_set_order(len); + + if (iocb && iocb->ki_flags & IOCB_DONTCACHE) + fgp_flags |= FGP_DONTCACHE; + + return __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} + +/** * filemap_get_folio - Find and get a folio. * @mapping: The address_space to search. * @index: The page index. diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 06cc8888199e8..c334f82ed385a 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -19,7 +19,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) return &sb->s_dquot; } -/* i_mutex must being held */ +/* i_rwsem must being held */ static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 83e36ad4e31be..d70700e5cef88 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -416,8 +416,6 @@ int io_pipe(struct io_kiocb *req, unsigned int issue_flags) ret = create_pipe_files(files, p->flags); if (ret) return ret; - files[0]->f_mode |= FMODE_NOWAIT; - files[1]->f_mode |= FMODE_NOWAIT; if (!!p->file_slot) ret = io_pipe_fixed(req, files, issue_flags); diff --git a/mm/filemap.c b/mm/filemap.c index bada249b9fb76..ba089d75fc868 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4109,7 +4109,7 @@ retry: break; } - status = a_ops->write_begin(file, mapping, pos, bytes, + status = a_ops->write_begin(iocb, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; @@ -4130,7 +4130,7 @@ retry: copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); - status = a_ops->write_end(file, mapping, pos, bytes, copied, + status = a_ops->write_end(iocb, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); diff --git a/mm/shmem.c b/mm/shmem.c index cbcb573cde061..457355cda4a9d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3270,9 +3270,9 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -3304,9 +3304,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; |