author     Christian Brauner <brauner@kernel.org>    2024-10-08 13:05:18 +0200
committer  Christian Brauner <brauner@kernel.org>    2024-10-30 09:58:02 +0100
commit     62eec753cae265002043872ba419d0887fe33ec6
tree       a0b6604acca7446936589359cc5c70828b9b50a4
parent     8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
parent     90ee6ed776c06435a3fe79c7f5344761f52e1760
Merge patch series "fs: introduce file_ref_t"
Christian Brauner <brauner@kernel.org> says:

As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop, it has O(N^2) behaviour under contention with N concurrent operations, and it sits in a hot path in __fget_files_rcu().

The rcuref infrastructure remedies this problem by using an unconditional increment, relying on safe and dead zones to make this work and requiring RCU protection for the data structure in question. This not only scales better, it also introduces overflow protection.

However, in contrast to generic rcuref, files require a memory barrier and thus cannot rely on *_relaxed() atomic operations. They also need to be built on atomic_long_t, as holding massive amounts of references isn't unheard of, even if it is just an attack.

As suggested by Linus, add a file-specific variant instead of making this a generic library.

I've been testing this with will-it-scale using a multi-threaded fstat() on the same file descriptor on a machine that Jens gave me access to (thank you very much!):

  processor  : 511
  vendor_id  : AuthenticAMD
  cpu family : 25
  model      : 160
  model name : AMD EPYC 9754 128-Core Processor

and I consistently get a 3-5% improvement on workloads with 256 and more threads, comparing v6.12-rc1 as base with and without these patches applied.

* patches from https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-0-387e24dc9163@kernel.org:
  fs: port files to file_ref
  fs: add file_ref
  fs: protect backing files with rcu

Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-0-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
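To make the contention argument concrete, here is a minimal userspace sketch contrasting the two acquisition strategies. It is not the kernel's file_ref_t implementation (which additionally provides saturation zones and the memory barriers files need); the names ref_get_cmpxchg, ref_get_zoned, ref_kill and REF_DEAD are made up purely for illustration.

/*
 * Minimal sketch only, not the kernel's file_ref_t. All names below
 * (ref_get_cmpxchg, ref_get_zoned, ref_kill, REF_DEAD) are hypothetical.
 */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

/*
 * atomic_inc_not_zero() style: a try_cmpxchg() retry loop. With N
 * threads hammering the same counter, each successful update can force
 * every other thread to retry, giving O(N^2) total work under contention.
 */
static bool ref_get_cmpxchg(_Atomic long *cnt)
{
	long old = atomic_load(cnt);

	do {
		if (old == 0)
			return false;	/* object already released */
	} while (!atomic_compare_exchange_weak(cnt, &old, old + 1));

	return true;
}

/*
 * rcuref/file_ref style: increment unconditionally, then inspect the
 * result. The final put moves the counter deep into a negative "dead
 * zone", so a bounded number of racing increments can never make a dead
 * counter look live again. Each acquisition is a single atomic RMW with
 * no retry loop; RCU must keep the memory valid while readers may still
 * touch the counter.
 */
#define REF_DEAD	(LONG_MIN / 2)

static bool ref_get_zoned(_Atomic long *cnt)
{
	return atomic_fetch_add(cnt, 1) + 1 > 0;
}

static void ref_kill(_Atomic long *cnt)
{
	atomic_store(cnt, REF_DEAD);	/* last put: enter the dead zone */
}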
Diffstat (limited to 'fs/file_table.c')
-rw-r--r--  fs/file_table.c | 49
1 file changed, 37 insertions(+), 12 deletions(-)
diff --git a/fs/file_table.c b/fs/file_table.c
index eed5ffad9997c..db4fde6fe620d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -40,13 +40,17 @@ static struct files_stat_struct files_stat = {
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
+static struct kmem_cache *bfilp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path user_path;
+ union {
+ struct path user_path;
+ freeptr_t bf_freeptr;
+ };
};
static inline struct backing_file *backing_file(struct file *f)
@@ -68,7 +72,7 @@ static inline void file_free(struct file *f)
put_cred(f->f_cred);
if (unlikely(f->f_mode & FMODE_BACKING)) {
path_put(backing_file_user_path(f));
- kfree(backing_file(f));
+ kmem_cache_free(bfilp_cachep, backing_file(f));
} else {
kmem_cache_free(filp_cachep, f);
}
@@ -165,16 +169,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* the respective member when opening the file.
*/
mutex_init(&f->f_pos_lock);
- f->f_flags = flags;
- f->f_mode = OPEN_FMODE(flags);
- /* f->f_version: 0 */
+ memset(&f->f_path, 0, sizeof(f->f_path));
+ memset(&f->f_ra, 0, sizeof(f->f_ra));
+
+ f->f_flags = flags;
+ f->f_mode = OPEN_FMODE(flags);
+
+ f->f_op = NULL;
+ f->f_mapping = NULL;
+ f->private_data = NULL;
+ f->f_inode = NULL;
+ f->f_owner = NULL;
+#ifdef CONFIG_EPOLL
+ f->f_ep = NULL;
+#endif
+
+ f->f_iocb_flags = 0;
+ f->f_pos = 0;
+ f->f_wb_err = 0;
+ f->f_sb_err = 0;
/*
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
* fget-rcu pattern users need to be able to handle spurious
* refcount bumps we should reinitialize the reused file first.
*/
- atomic_long_set(&f->f_count, 1);
+ file_ref_init(&f->f_ref, 1);
return 0;
}
@@ -206,7 +226,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -240,7 +260,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
struct file *f;
int error;
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -267,13 +287,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
struct backing_file *ff;
int error;
- ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
+ ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
if (unlikely(!ff))
return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred);
if (unlikely(error)) {
- kfree(ff);
+ kmem_cache_free(bfilp_cachep, ff);
return ERR_PTR(error);
}
@@ -479,7 +499,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (file_ref_put(&file->f_ref)) {
struct task_struct *task = current;
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -512,7 +532,7 @@ void fput(struct file *file)
*/
void __fput_sync(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count))
+ if (file_ref_put(&file->f_ref))
__fput(file);
}
@@ -529,6 +549,11 @@ void __init files_init(void)
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
+
+ args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
+ bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
+ &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
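For context on why init_file() now resets every field by hand and leaves f_ref for last: the lookup side relies on SLAB_TYPESAFE_BY_RCU object reuse. Below is a rough, simplified sketch of that fget-rcu pattern; the real code is __fget_files_rcu() in fs/file.c and differs in detail, so treat this only as an illustration of the race the comment in init_file() refers to.

#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/rcupdate.h>

/*
 * Simplified sketch of the fget-rcu lookup pattern, not the actual
 * __fget_files_rcu() implementation.
 */
static struct file *fget_rcu_sketch(struct files_struct *files, unsigned int fd)
{
	struct file *file;

	rcu_read_lock();
	for (;;) {
		file = files_lookup_fd_raw(files, fd);
		if (!file)
			break;

		/*
		 * The slab is SLAB_TYPESAFE_BY_RCU, so the object may have
		 * been freed and reused since the lookup. This is why
		 * init_file() reinitializes all other fields before f_ref,
		 * and why spurious refcount bumps must be tolerated.
		 */
		if (!file_ref_get(&file->f_ref))
			continue;	/* raced with the final fput() */

		/* Re-check that the descriptor still refers to this file. */
		if (unlikely(file != files_lookup_fd_raw(files, fd))) {
			fput(file);	/* bumped a reused file, drop it */
			continue;
		}
		break;
	}
	rcu_read_unlock();
	return file;
}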