Diffstat (limited to 'kernel')
49 files changed, 578 insertions, 374 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 367eaf2c78b7e..0ebbbe37a60f0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -347,12 +347,17 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 /* Get path information necessary for adding watches. */
 static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 {
-	struct dentry *d = kern_path_locked(watch->path, parent);
+	struct dentry *d;
+
+	d = kern_path_locked_negative(watch->path, parent);
 	if (IS_ERR(d))
 		return PTR_ERR(d);
-	/* update watch filter fields */
-	watch->dev = d->d_sb->s_dev;
-	watch->ino = d_backing_inode(d)->i_ino;
+
+	if (d_is_positive(d)) {
+		/* update watch filter fields */
+		watch->dev = d->d_sb->s_dev;
+		watch->ino = d_backing_inode(d)->i_ino;
+	}
 
 	inode_unlock(d_backing_inode(parent->dentry));
 	dput(d);
@@ -418,11 +423,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 	/* caller expects mutex locked */
 	mutex_lock(&audit_filter_mutex);
 
-	if (ret && ret != -ENOENT) {
+	if (ret) {
 		audit_put_watch(watch);
 		return ret;
 	}
-	ret = 0;
 
 	/* either find an old parent or attach a new one */
 	parent = audit_find_parent(d_backing_inode(parent_path.dentry));
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5a5adc66b8e22..92b606d600207 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -2189,7 +2189,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
 		b = &htab->buckets[i];
 		rcu_read_lock();
 		head = &b->head;
-		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+		hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
 			key = elem->key;
 			if (is_percpu) {
 				/* current cpu value for percpu map */
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dc3aa91a6ba03..5c2e96b19392a 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -421,7 +421,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
 	int ret;
 
 	inode_lock(parent->d_inode);
-	dentry = lookup_one_len(name, parent, strlen(name));
+	dentry = lookup_noperm(&QSTR(name), parent);
 	if (IS_ERR(dentry)) {
 		inode_unlock(parent->d_inode);
 		return PTR_ERR(dentry);
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 2fdf3c978db1b..774e5a5388112 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -89,5 +89,6 @@ static void __exit fini(void)
 }
 late_initcall(load);
 module_exit(fini);
+MODULE_IMPORT_NS("BPF_INTERNAL");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs");
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9794446bc8c6c..64c3393e82700 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1583,7 +1583,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
 
 	return map;
 }
-EXPORT_SYMBOL(bpf_map_get);
+EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL");
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 {
@@ -3364,7 +3364,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 	bpf_link_inc(link);
 	return link;
 }
-EXPORT_SYMBOL(bpf_link_get_from_fd);
+EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL");
 
 static void bpf_tracing_link_release(struct bpf_link *link)
 {
@@ -6020,7 +6020,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 		return ____bpf_sys_bpf(cmd, attr, size);
 	}
 }
-EXPORT_SYMBOL(kern_sys_bpf);
+EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL");
 
 static const struct bpf_func_proto bpf_sys_bpf_proto = {
 	.func		= bpf_sys_bpf,
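The EXPORT_SYMBOL_NS()/MODULE_IMPORT_NS() pairing above restricts bpf_map_get(), bpf_link_get_from_fd() and kern_sys_bpf() to modules that explicitly import the "BPF_INTERNAL" namespace. A minimal sketch of the mechanism, using hypothetical names (demo_fn, DEMO_NS) that are not part of this series; the quoted-string form of the macros is the one this tree uses:

/* producer module: the symbol is only linkable by importers of "DEMO_NS" */
#include <linux/module.h>

int demo_fn(void)
{
	return 42;
}
EXPORT_SYMBOL_NS(demo_fn, "DEMO_NS");

/* consumer module: without this line, modpost rejects the use of demo_fn() */
MODULE_IMPORT_NS("DEMO_NS");
MODULE_LICENSE("GPL");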
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3caf2cd86e65f..63e5b90da1f30 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -90,7 +90,7 @@
 DEFINE_MUTEX(cgroup_mutex);
 DEFINE_SPINLOCK(css_set_lock);
 
-#ifdef CONFIG_PROVE_RCU
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
 #endif
@@ -2353,9 +2353,37 @@ static struct file_system_type cgroup2_fs_type = {
 };
 
 #ifdef CONFIG_CPUSETS_V1
+enum cpuset_param {
+	Opt_cpuset_v2_mode,
+};
+
+static const struct fs_parameter_spec cpuset_fs_parameters[] = {
+	fsparam_flag	("cpuset_v2_mode", Opt_cpuset_v2_mode),
+	{}
+};
+
+static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+	struct fs_parse_result result;
+	int opt;
+
+	opt = fs_parse(fc, cpuset_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_cpuset_v2_mode:
+		ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+		return 0;
+	}
+	return -EINVAL;
+}
+
 static const struct fs_context_operations cpuset_fs_context_ops = {
 	.get_tree	= cgroup1_get_tree,
 	.free		= cgroup_fs_context_free,
+	.parse_param	= cpuset_parse_param,
 };
 
 /*
@@ -2392,6 +2420,7 @@ static int cpuset_init_fs_context(struct fs_context *fc)
 static struct file_system_type cpuset_fs_type = {
 	.name			= "cpuset",
 	.init_fs_context	= cpuset_init_fs_context,
+	.parameters		= cpuset_fs_parameters,
 	.fs_flags		= FS_USERNS_MOUNT,
 };
 #endif
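The new cpuset_fs_parameters table is what makes the legacy "cpuset" filesystem accept the cpuset_v2_mode flag again through the new mount API. A hedged userspace sketch using raw syscalls (error handling abbreviated, mount point illustrative):

#define _GNU_SOURCE
#include <fcntl.h>		/* AT_FDCWD */
#include <linux/mount.h>	/* FSCONFIG_*, MOVE_MOUNT_F_EMPTY_PATH */
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "cpuset", 0);
	/* the flag parsed by cpuset_parse_param() above */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "cpuset_v2_mode", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
	syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/dev/cpuset",
		MOVE_MOUNT_F_EMPTY_PATH);
	return 0;
}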
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 306b604300914..24b70ea3e6ce9 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1116,9 +1116,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
 
 		if (top_cs) {
 			/*
-			 * Percpu kthreads in top_cpuset are ignored
+			 * PF_NO_SETAFFINITY tasks are ignored.
+			 * All per cpu kthreads should have PF_NO_SETAFFINITY
+			 * flag set, see kthread_set_per_cpu().
 			 */
-			if (kthread_is_per_cpu(task))
+			if (task->flags & PF_NO_SETAFFINITY)
 				continue;
 			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
 		} else {
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 3b2bdca9f1d4b..77c8d9487a9ab 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -336,16 +336,22 @@ static phys_addr_t dma_reserved_default_memory_size __initdata;
 
 static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
 {
-	if (!rmem->priv) {
-		struct dma_coherent_mem *mem;
+	struct dma_coherent_mem *mem = rmem->priv;
 
+	if (!mem) {
 		mem = dma_init_coherent_memory(rmem->base, rmem->base,
 					       rmem->size, true);
 		if (IS_ERR(mem))
 			return PTR_ERR(mem);
 		rmem->priv = mem;
 	}
-	dma_assign_coherent_memory(dev, rmem->priv);
+
+	/* Warn if the device potentially can't use the reserved memory */
+	if (mem->device_base + rmem->size - 1 >
+	    min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit))
+		dev_warn(dev, "reserved memory is beyond device's set DMA address range\n");
+
+	dma_assign_coherent_memory(dev, mem);
 	return 0;
 }
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 055da410ac71d..8df0dfaaca18e 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area;
  * Users, who want to set the size of global CMA area for their system
  * should use cma= kernel parameter.
  */
-static const phys_addr_t size_bytes __initconst =
-	(phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M)
 static phys_addr_t size_cmdline __initdata = -1;
 static phys_addr_t base_cmdline __initdata;
 static phys_addr_t limit_cmdline __initdata;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index cda127027e48a..051a32988040f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -910,6 +910,19 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
 
+static bool __dma_addressing_limited(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+	    dma_get_required_mask(dev))
+		return true;
+
+	if (unlikely(ops) || use_dma_iommu(dev))
+		return false;
+	return !dma_direct_all_ram_mapped(dev);
+}
+
 /**
  * dma_addressing_limited - return if the device is addressing limited
  * @dev:	device to check
@@ -920,15 +933,11 @@ EXPORT_SYMBOL(dma_set_coherent_mask);
  */
 bool dma_addressing_limited(struct device *dev)
 {
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
-	    dma_get_required_mask(dev))
-		return true;
-
-	if (unlikely(ops) || use_dma_iommu(dev))
+	if (!__dma_addressing_limited(dev))
 		return false;
-	return !dma_direct_all_ram_mapped(dev);
+
+	dev_dbg(dev, "device is DMA addressing limited\n");
+	return true;
 }
 EXPORT_SYMBOL_GPL(dma_addressing_limited);
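The mapping.c refactor keeps dma_addressing_limited() semantics but now emits a dev_dbg() when the limit is hit. A sketch of typical driver-side use (hypothetical demo_probe_dma(), not from this patch):

#include <linux/dma-mapping.h>

static int demo_probe_dma(struct device *dev)
{
	int ret;

	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
	if (ret)
		return ret;

	/*
	 * With the change above this path also logs a debug line,
	 * making "why is this driver bouncing?" diagnosable.
	 */
	if (dma_addressing_limited(dev))
		dev_info(dev, "using bounce buffers for DMA\n");
	return 0;
}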
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e93c195659140..95e703891b24f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3943,7 +3943,7 @@ static int merge_sched_in(struct perf_event *event, void *data)
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 
 			if (*perf_event_fasync(event))
-				event->pending_kill = POLL_HUP;
+				event->pending_kill = POLL_ERR;
 
 			perf_event_wakeup(event);
 		} else {
@@ -6075,7 +6075,7 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
 
 	if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
 		     event->attr.pinned))
-		return events;
+		return EPOLLERR;
 
 	/*
 	 * Pin the event->rb by taking event->mmap_mutex; otherwise
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e0..c33ecde016dee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,8 +133,13 @@ struct release_task_post {
 static void __unhash_process(struct release_task_post *post, struct task_struct *p,
 			     bool group_dead)
 {
+	struct pid *pid = task_pid(p);
+
 	nr_threads--;
+
 	detach_pid(post->pids, p, PIDTYPE_PID);
+	wake_up_all(&pid->wait_pidfd);
+
 	if (group_dead) {
 		detach_pid(post->pids, p, PIDTYPE_TGID);
 		detach_pid(post->pids, p, PIDTYPE_PGID);
@@ -253,7 +258,8 @@ repeat:
 	pidfs_exit(p);
 	cgroup_release(p);
 
-	thread_pid = get_pid(p->thread_pid);
+	/* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
+	thread_pid = task_pid(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
@@ -282,8 +288,8 @@ repeat:
 	}
 
 	write_unlock_irq(&tasklist_lock);
+	/* @thread_pid can't go away until free_pids() below */
 	proc_flush_pid(thread_pid);
-	put_pid(thread_pid);
 	add_device_randomness(&p->se.sum_exec_runtime,
 			      sizeof(p->se.sum_exec_runtime));
 	free_pids(post.pids);
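Moving the wake_up_all(&pid->wait_pidfd) into __unhash_process() means pidfd waiters are woken exactly when the task is unhashed. A userspace sketch of the consumer side, using the standard pidfd_open() + poll() pattern:

#define _GNU_SOURCE
#include <poll.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0)
		_exit(0);		/* child exits immediately */

	int pidfd = syscall(SYS_pidfd_open, child, 0);
	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };

	poll(&pfd, 1, -1);		/* returns once the child has exited */
	waitpid(child, NULL, 0);
	return 0;
}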
diff --git a/kernel/fork.c b/kernel/fork.c
index c4b26cd8998b8..aad55b0e103c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -498,10 +498,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
-	/* track_pfn_copy() will later take care of copying internal state. */
-	if (unlikely(new->vm_flags & VM_PFNMAP))
-		untrack_pfn_clear(new);
-
 	return new;
 }
 
@@ -672,6 +668,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		tmp = vm_area_dup(mpnt);
 		if (!tmp)
 			goto fail_nomem;
+
+		/* track_pfn_copy() will later take care of copying internal state. */
+		if (unlikely(tmp->vm_flags & VM_PFNMAP))
+			untrack_pfn_clear(tmp);
+
 		retval = vma_dup_policy(mpnt, tmp);
 		if (retval)
 			goto fail_nomem_policy;
@@ -2036,17 +2037,16 @@ static inline void rcu_copy_process(struct task_struct *p)
 }
 
 /**
- * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid:   the struct pid for which to create a pidfd
  * @flags: flags of the new @pidfd
- * @ret: Where to return the file for the pidfd.
+ * @ret_file: return the new pidfs file
  *
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
  *
- * The helper doesn't perform checks on @pid which makes it useful for pidfds
- * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
- * pidfd file are prepared.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
  *
  * If this function returns successfully the caller is responsible to either
  * call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2063,59 +2063,50 @@ static inline void rcu_copy_process(struct task_struct *p)
  *         error, a negative error code is returned from the function and the
  *         last argument remains unchanged.
  */
-static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
 {
-	struct file *pidfd_file;
+	struct file *pidfs_file;
+
+	/*
+	 * PIDFD_STALE is only allowed to be passed if the caller knows
+	 * that @pid is already registered in pidfs and thus
+	 * PIDFD_INFO_EXIT information is guaranteed to be available.
+	 */
+	if (!(flags & PIDFD_STALE)) {
+		/*
+		 * While holding the pidfd waitqueue lock removing the
+		 * task linkage for the thread-group leader pid
+		 * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+		 * task linkage for PIDTYPE_PID not having thread-group
+		 * leader linkage for the pid means it wasn't a
+		 * thread-group leader in the first place.
+		 */
+		guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+		/* Task has already been reaped. */
+		if (!pid_has_task(pid, PIDTYPE_PID))
+			return -ESRCH;
+
+		/*
+		 * If this struct pid isn't used as a thread-group
+		 * leader but the caller requested to create a
+		 * thread-group leader pidfd then report ENOENT.
+		 */
+		if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
+			return -ENOENT;
+	}
 
 	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
 	if (pidfd < 0)
 		return pidfd;
 
-	pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
-	if (IS_ERR(pidfd_file))
-		return PTR_ERR(pidfd_file);
+	pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+	if (IS_ERR(pidfs_file))
+		return PTR_ERR(pidfs_file);
 
-	*ret = pidfd_file;
+	*ret_file = pidfs_file;
 	return take_fd(pidfd);
 }
 
-/**
- * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
- * @pid: the struct pid for which to create a pidfd
- * @flags: flags of the new @pidfd
- * @ret: Where to return the pidfd.
- *
- * Allocate a new file that stashes @pid and reserve a new pidfd number in the
- * caller's file descriptor table. The pidfd is reserved but not installed yet.
- *
- * The helper verifies that @pid is still in use, without PIDFD_THREAD the
- * task identified by @pid must be a thread-group leader.
- *
- * If this function returns successfully the caller is responsible to either
- * call fd_install() passing the returned pidfd and pidfd file as arguments in
- * order to install the pidfd into its file descriptor table or they must use
- * put_unused_fd() and fput() on the returned pidfd and pidfd file
- * respectively.
- *
- * This function is useful when a pidfd must already be reserved but there
- * might still be points of failure afterwards and the caller wants to ensure
- * that no pidfd is leaked into its file descriptor table.
- *
- * Return: On success, a reserved pidfd is returned from the function and a new
- *         pidfd file is returned in the last argument to the function. On
- *         error, a negative error code is returned from the function and the
- *         last argument remains unchanged.
- */
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
-{
-	bool thread = flags & PIDFD_THREAD;
-
-	if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
-		return -EINVAL;
-
-	return __pidfd_prepare(pid, flags, ret);
-}
-
 static void __delayed_free_task(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2462,7 +2453,7 @@ __latent_entropy struct task_struct *copy_process(
 	 * Note that no task has been attached to @pid yet indicate
 	 * that via CLONE_PIDFD.
 	 */
-	retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
+	retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
 	if (retval < 0)
 		goto bad_fork_free_pid;
 	pidfd = retval;
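For reference, CLONE_PIDFD is the caller that reaches pidfd_prepare() with PIDFD_STALE: the pidfd is reserved before any task is attached to the struct pid, so the liveness checks must be skipped. A minimal clone3() sketch from userspace:

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int pidfd = -1;
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.flags = CLONE_PIDFD;
	args.pidfd = (__u64)(uintptr_t)&pidfd;	/* kernel stores the fd here */
	args.exit_signal = SIGCHLD;

	pid_t child = syscall(SYS_clone3, &args, sizeof(args));
	if (child == 0)
		_exit(0);
	/* parent: pidfd now refers to the child */
	return 0;
}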
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 5c8d43cdb0a31..c05ba7ca00faa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -761,7 +761,7 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw
 static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d,
 				  struct irq_data *irqd, int ind)
 {
-	struct msi_desc *desc = irq_data_get_msi_desc(irqd);
+	struct msi_desc *desc = irqd ? irq_data_get_msi_desc(irqd) : NULL;
 
 	if (!desc)
 		return;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index d6964fc29f513..ef234469baaca 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
 	return !reader; /* wake (readers until) 1 writer */
 }
 
-static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader,
+			      bool freeze)
 {
 	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
 	bool wait;
@@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
 	spin_unlock_irq(&sem->waiters.lock);
 
 	while (wait) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE |
+				  (freeze ? TASK_FREEZABLE : 0));
 		if (!smp_load_acquire(&wq_entry.private))
 			break;
 		schedule();
@@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
 	__set_current_state(TASK_RUNNING);
 }
 
-bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try,
+				bool freeze)
 {
 	if (__percpu_down_read_trylock(sem))
 		return true;
@@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 	trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
 	preempt_enable();
-	percpu_rwsem_wait(sem, /* .reader = */ true);
+	percpu_rwsem_wait(sem, /* .reader = */ true, freeze);
 	preempt_disable();
 	trace_contention_end(sem, 0);
@@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
 	 */
 	if (!__percpu_down_write_trylock(sem)) {
 		trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
-		percpu_rwsem_wait(sem, /* .reader = */ false);
+		percpu_rwsem_wait(sem, /* .reader = */ false, false);
 		contended = true;
 	}
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index d7762ef5949a2..39278737bb68f 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -192,6 +192,11 @@ config GENDWARFKSYMS
 	depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT
 	# Requires ELF object files.
 	depends on !LTO
+	# To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on
+	# X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop
+	# indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder:
+	# Verify 0 address DWARF variables are in ELF section").
+	depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129
 	help
 	  Calculate symbol versions from DWARF debugging information using
 	  gendwarfksyms. Requires DEBUG_INFO to be enabled.
diff --git a/kernel/module/main.c b/kernel/module/main.c
index a2859dc3eea66..5c6ab20240a6d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2829,6 +2829,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 {
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
+	codetag_free_module_sections(mod);
 	free_mod_mem(mod);
 }
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c9d97ed201227..5f31fdff8a38f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -128,17 +128,13 @@ out_time:
 out_net:
 	put_cgroup_ns(new_nsp->cgroup_ns);
 out_cgroup:
-	if (new_nsp->pid_ns_for_children)
-		put_pid_ns(new_nsp->pid_ns_for_children);
+	put_pid_ns(new_nsp->pid_ns_for_children);
 out_pid:
-	if (new_nsp->ipc_ns)
-		put_ipc_ns(new_nsp->ipc_ns);
+	put_ipc_ns(new_nsp->ipc_ns);
 out_ipc:
-	if (new_nsp->uts_ns)
-		put_uts_ns(new_nsp->uts_ns);
+	put_uts_ns(new_nsp->uts_ns);
 out_uts:
-	if (new_nsp->mnt_ns)
-		put_mnt_ns(new_nsp->mnt_ns);
+	put_mnt_ns(new_nsp->mnt_ns);
 out_ns:
 	kmem_cache_free(nsproxy_cachep, new_nsp);
 	return ERR_PTR(err);
@@ -189,18 +185,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 
 void free_nsproxy(struct nsproxy *ns)
 {
-	if (ns->mnt_ns)
-		put_mnt_ns(ns->mnt_ns);
-	if (ns->uts_ns)
-		put_uts_ns(ns->uts_ns);
-	if (ns->ipc_ns)
-		put_ipc_ns(ns->ipc_ns);
-	if (ns->pid_ns_for_children)
-		put_pid_ns(ns->pid_ns_for_children);
-	if (ns->time_ns)
-		put_time_ns(ns->time_ns);
-	if (ns->time_ns_for_children)
-		put_time_ns(ns->time_ns_for_children);
+	put_mnt_ns(ns->mnt_ns);
+	put_uts_ns(ns->uts_ns);
+	put_ipc_ns(ns->ipc_ns);
+	put_pid_ns(ns->pid_ns_for_children);
+	put_time_ns(ns->time_ns);
+	put_time_ns(ns->time_ns_for_children);
 	put_cgroup_ns(ns->cgroup_ns);
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
diff --git a/kernel/padata.c b/kernel/padata.c
index b3d4eacc4f5d8..7eee94166357a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd)
 		 * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
 		 */
 		padata_get_pd(pd);
-		queue_work(pinst->serial_wq, &pd->reorder_work);
+		if (!queue_work(pinst->serial_wq, &pd->reorder_work))
+			padata_put_pd(pd);
 	}
 }
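The padata fix is an instance of a general rule: queue_work() returns false when the work item was already pending, in which case the reference taken on the work's behalf is not consumed and the caller must drop it. A pattern sketch with a hypothetical demo_obj (not from the patch); the work handler is assumed to call demo_obj_put() when it finishes:

#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_obj {
	refcount_t ref;
	struct work_struct work;
};

static void demo_obj_put(struct demo_obj *obj)
{
	if (refcount_dec_and_test(&obj->ref))
		kfree(obj);
}

static void demo_kick(struct demo_obj *obj)
{
	refcount_inc(&obj->ref);	/* ref owned by the work item */
	if (!queue_work(system_wq, &obj->work))
		demo_obj_put(obj);	/* already queued: ref not consumed */
}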
diff --git a/kernel/params.c b/kernel/params.c
index 2509f216c9f3c..b92d64161b758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -760,38 +760,35 @@ void destroy_params(const struct kernel_param *params, unsigned num)
 			params[i].ops->free(params[i].arg);
 }
 
-static struct module_kobject * __init locate_module_kobject(const char *name)
+struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
 {
 	struct module_kobject *mk;
 	struct kobject *kobj;
 	int err;
 
 	kobj = kset_find_obj(module_kset, name);
-	if (kobj) {
-		mk = to_module_kobject(kobj);
-	} else {
-		mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
-		BUG_ON(!mk);
-
-		mk->mod = THIS_MODULE;
-		mk->kobj.kset = module_kset;
-		err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
-					   "%s", name);
-#ifdef CONFIG_MODULES
-		if (!err)
-			err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
-#endif
-		if (err) {
-			kobject_put(&mk->kobj);
-			pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
-				name, err);
-			return NULL;
-		}
+	if (kobj)
+		return to_module_kobject(kobj);
 
-		/* So that we hold reference in both cases. */
-		kobject_get(&mk->kobj);
+	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+	if (!mk)
+		return NULL;
+
+	mk->mod = THIS_MODULE;
+	mk->kobj.kset = module_kset;
+	err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
+	if (IS_ENABLED(CONFIG_MODULES) && !err)
+		err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+	if (err) {
+		kobject_put(&mk->kobj);
+		pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
+			name, err);
+		return NULL;
 	}
 
+	/* So that we hold reference in both cases. */
+	kobject_get(&mk->kobj);
+
 	return mk;
 }
 
@@ -802,7 +799,7 @@ static void __init kernel_add_sysfs_param(const char *name,
 	struct module_kobject *mk;
 	int err;
 
-	mk = locate_module_kobject(name);
+	mk = lookup_or_create_module_kobject(name);
 	if (!mk)
 		return;
 
@@ -873,7 +870,7 @@ static void __init version_sysfs_builtin(void)
 	int err;
 
 	for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
-		mk = locate_module_kobject(vattr->module_name);
+		mk = lookup_or_create_module_kobject(vattr->module_name);
 		if (mk) {
 			err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
 			WARN_ON_ONCE(err);
@@ -946,7 +943,9 @@ struct kset *module_kset;
 static void module_kobj_release(struct kobject *kobj)
 {
 	struct module_kobject *mk = to_module_kobject(kobj);
-	complete(mk->kobj_completion);
+
+	if (mk->kobj_completion)
+		complete(mk->kobj_completion);
 }
 
 const struct kobj_type module_ktype = {
diff --git a/kernel/pid.c b/kernel/pid.c
index 4ac2ce46817fd..8317bcbc7cf7d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,6 +100,7 @@ void put_pid(struct pid *pid)
 
 	ns = pid->numbers[pid->level].ns;
 	if (refcount_dec_and_test(&pid->count)) {
+		WARN_ON_ONCE(pid->stashed);
 		kmem_cache_free(ns->pid_cachep, pid);
 		put_pid_ns(ns);
 	}
@@ -359,11 +360,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task,
 	hlist_del_rcu(&task->pid_links[type]);
 	*pid_ptr = new;
 
-	if (type == PIDTYPE_PID) {
-		WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
-		wake_up_all(&pid->wait_pidfd);
-	}
-
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 		if (pid_has_task(pid, tmp))
 			return;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 23c0f4e6cb2ff..338c9917d4ee2 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -778,6 +778,8 @@ int hibernate(void)
 		goto Restore;
 
 	ksys_sync_helper();
+	if (filesystem_freeze_enabled)
+		filesystems_freeze();
 
 	error = freeze_processes();
 	if (error)
@@ -846,6 +848,7 @@ int hibernate(void)
 	/* Don't bother checking whether freezer_test_done is true */
 	freezer_test_done = false;
  Exit:
+	filesystems_thaw();
 	pm_notifier_call_chain(PM_POST_HIBERNATION);
  Restore:
 	pm_restore_console();
@@ -882,6 +885,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
 	if (error)
 		goto restore;
 
+	if (filesystem_freeze_enabled)
+		filesystems_freeze();
+
 	error = freeze_processes();
 	if (error)
 		goto exit;
@@ -941,6 +947,7 @@ thaw:
 	thaw_processes();
 
 exit:
+	filesystems_thaw();
 	pm_notifier_call_chain(PM_POST_HIBERNATION);
 
 restore:
@@ -1029,19 +1036,26 @@ static int software_resume(void)
 	if (error)
 		goto Restore;
 
+	if (filesystem_freeze_enabled)
+		filesystems_freeze();
+
 	pm_pr_dbg("Preparing processes for hibernation restore.\n");
 	error = freeze_processes();
-	if (error)
+	if (error) {
+		filesystems_thaw();
 		goto Close_Finish;
+	}
 
 	error = freeze_kernel_threads();
 	if (error) {
 		thaw_processes();
+		filesystems_thaw();
 		goto Close_Finish;
 	}
 
 	error = load_image_and_restore();
 	thaw_processes();
+	filesystems_thaw();
  Finish:
 	pm_notifier_call_chain(PM_POST_RESTORE);
  Restore:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6254814d48171..0b0e76324c435 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -962,6 +962,34 @@ power_attr(pm_freeze_timeout);
 
 #endif	/* CONFIG_FREEZER*/
 
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+bool filesystem_freeze_enabled = false;
+
+static ssize_t freeze_filesystems_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled);
+}
+
+static ssize_t freeze_filesystems_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t n)
+{
+	unsigned long val;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	if (val > 1)
+		return -EINVAL;
+
+	filesystem_freeze_enabled = !!val;
+	return n;
+}
+
+power_attr(freeze_filesystems);
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+
 static struct attribute * g[] = {
 	&state_attr.attr,
 #ifdef CONFIG_PM_TRACE
@@ -992,6 +1020,9 @@ static struct attribute * g[] = {
 #ifdef CONFIG_FREEZER
 	&pm_freeze_timeout_attr.attr,
 #endif
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+	&freeze_filesystems_attr.attr,
+#endif
 	NULL,
 };
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c352dea2f67b5..2eb81662b8fa5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -18,6 +18,10 @@ struct swsusp_info {
 	unsigned long		size;
 } __aligned(PAGE_SIZE);
 
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+extern bool filesystem_freeze_enabled;
+#endif
+
 #ifdef CONFIG_HIBERNATION
 /* kernel/power/snapshot.c */
 extern void __init hibernate_reserved_size_init(void);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8eaec4ab121d4..76b141b9aac01 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,6 +30,7 @@
 #include <trace/events/power.h>
 #include <linux/compiler.h>
 #include <linux/moduleparam.h>
+#include <linux/fs.h>
 
 #include "power.h"
 
@@ -374,6 +375,8 @@ static int suspend_prepare(suspend_state_t state)
 	if (error)
 		goto Restore;
 
+	if (filesystem_freeze_enabled)
+		filesystems_freeze();
 	trace_suspend_resume(TPS("freeze_processes"), 0, true);
 	error = suspend_freeze_processes();
 	trace_suspend_resume(TPS("freeze_processes"), 0, false);
@@ -550,6 +553,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 static void suspend_finish(void)
 {
 	suspend_thaw_processes();
+	filesystems_thaw();
 	pm_notifier_call_chain(PM_POST_SUSPEND);
 	pm_restore_console();
 }
@@ -588,6 +592,8 @@ static int enter_state(suspend_state_t state)
 		ksys_sync_helper();
 		trace_suspend_resume(TPS("sync_filesystems"), 0, false);
 	}
+	if (filesystem_freeze_enabled)
+		filesystems_freeze();
 
 	pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
 	pm_suspend_clear_flags();
@@ -609,6 +615,7 @@ static int enter_state(suspend_state_t state)
 	pm_pr_dbg("Finishing wakeup.\n");
 	suspend_finish();
  Unlock:
+	filesystems_thaw();
 	mutex_unlock(&system_transition_mutex);
 	return error;
 }
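The new knob defaults to off; userspace opts in before suspend or hibernation. A sketch equivalent to `echo 1 > /sys/power/freeze_filesystems`:

#include <fcntl.h>
#include <unistd.h>

int enable_fs_freeze_on_sleep(void)
{
	int fd = open("/sys/power/freeze_filesystems", O_WRONLY);

	if (fd < 0)
		return -1;	/* kernel without CONFIG_SUSPEND/HIBERNATION */
	ssize_t n = write(fd, "1", 1);
	close(fd);
	return n == 1 ? 0 : -1;
}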
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ff5f933a626..ad13c461b657c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -268,35 +268,26 @@ static void hib_end_io(struct bio *bio)
 	bio_put(bio);
 }
 
-static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
+static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr)
+{
+	return bdev_rw_virt(file_bdev(hib_resume_bdev_file),
+			page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf);
+}
+
+static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr,
 		struct hib_bio_batch *hb)
 {
-	struct page *page = virt_to_page(addr);
 	struct bio *bio;
-	int error = 0;
 
 	bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf,
 			GFP_NOIO | __GFP_HIGH);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
-
-	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		pr_err("Adding page to bio failed at %llu\n",
-		       (unsigned long long)bio->bi_iter.bi_sector);
-		bio_put(bio);
-		return -EFAULT;
-	}
-
-	if (hb) {
-		bio->bi_end_io = hib_end_io;
-		bio->bi_private = hb;
-		atomic_inc(&hb->count);
-		submit_bio(bio);
-	} else {
-		error = submit_bio_wait(bio);
-		bio_put(bio);
-	}
-
-	return error;
+	bio_add_virt_nofail(bio, addr, PAGE_SIZE);
+	bio->bi_end_io = hib_end_io;
+	bio->bi_private = hb;
+	atomic_inc(&hb->count);
+	submit_bio(bio);
+	return 0;
 }
 
 static int hib_wait_io(struct hib_bio_batch *hb)
@@ -316,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 {
 	int error;
 
-	hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
 	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -329,8 +320,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		swsusp_header->flags = flags;
 		if (flags & SF_CRC32_MODE)
 			swsusp_header->crc32 = handle->crc32;
-		error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
-				      swsusp_resume_block, swsusp_header, NULL);
+		error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
+				      swsusp_resume_block, swsusp_header);
 	} else {
 		pr_err("Swap header not found!\n");
 		error = -ENODEV;
@@ -380,36 +371,30 @@ static int swsusp_swap_check(void)
 
 static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 {
+	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 	void *src;
 	int ret;
 
 	if (!offset)
 		return -ENOSPC;
 
-	if (hb) {
-		src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
-		                              __GFP_NORETRY);
-		if (src) {
-			copy_page(src, buf);
-		} else {
-			ret = hib_wait_io(hb); /* Free pages */
-			if (ret)
-				return ret;
-			src = (void *)__get_free_page(GFP_NOIO |
-			                              __GFP_NOWARN |
-			                              __GFP_NORETRY);
-			if (src) {
-				copy_page(src, buf);
-			} else {
-				WARN_ON_ONCE(1);
-				hb = NULL;	/* Go synchronous */
-				src = buf;
-			}
-		}
-	} else {
-		src = buf;
+	if (!hb)
+		goto sync_io;
+
+	src = (void *)__get_free_page(gfp);
+	if (!src) {
+		ret = hib_wait_io(hb); /* Free pages */
+		if (ret)
+			return ret;
+		src = (void *)__get_free_page(gfp);
+		if (WARN_ON_ONCE(!src))
+			goto sync_io;
 	}
-	return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
+
+	copy_page(src, buf);
+	return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
+sync_io:
+	return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf);
 }
 
 static void release_swap_writer(struct swap_map_handle *handle)
@@ -1041,7 +1026,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 			return -ENOMEM;
 		}
 
-		error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL);
+		error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map);
 		if (error) {
 			release_swap_reader(handle);
 			return error;
@@ -1065,7 +1050,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = hib_submit_io(REQ_OP_READ, offset, buf, hb);
+	if (hb)
+		error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb);
+	else
+		error = hib_submit_io_sync(REQ_OP_READ, offset, buf);
 	if (error)
 		return error;
	if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1590,8 +1578,8 @@ int swsusp_check(bool exclusive)
 			BLK_OPEN_READ, holder, NULL);
 	if (!IS_ERR(hib_resume_bdev_file)) {
 		clear_page(swsusp_header);
-		error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
-					swsusp_header, NULL);
+		error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block,
+					swsusp_header);
 		if (error)
 			goto put;
 
@@ -1599,9 +1587,9 @@ int swsusp_check(bool exclusive)
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
 			swsusp_header_flags = swsusp_header->flags;
 			/* Reset swap signature now */
-			error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
+			error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
 						swsusp_resume_block,
-						swsusp_header, NULL);
+						swsusp_header);
 		} else {
 			error = -EINVAL;
 		}
@@ -1650,13 +1638,12 @@ int swsusp_unmark(void)
 {
 	int error;
 
-	hib_submit_io(REQ_OP_READ, swsusp_resume_block,
-			swsusp_header, NULL);
+	hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
 	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
-		error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
+		error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
 					swsusp_resume_block,
-					swsusp_header, NULL);
+					swsusp_header);
 	} else {
 		pr_err("Cannot find swsusp signature!\n");
 		error = -ENODEV;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1a19d69b91ed3..816f07f9d30f1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -81,9 +81,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 	if (!cpufreq_this_cpu_can_update(sg_policy->policy))
 		return false;
 
-	if (unlikely(sg_policy->limits_changed)) {
-		sg_policy->limits_changed = false;
-		sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+	if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
+		WRITE_ONCE(sg_policy->limits_changed, false);
+		sg_policy->need_freq_update = true;
+
+		/*
+		 * The above limits_changed update must occur before the reads
+		 * of policy limits in cpufreq_driver_resolve_freq() or a policy
+		 * limits update might be missed, so use a memory barrier to
+		 * ensure it.
+		 *
+		 * This pairs with the write memory barrier in sugov_limits().
+		 */
+		smp_mb();
+
+		return true;
+	} else if (sg_policy->need_freq_update) {
+		/* ignore_dl_rate_limit() wants a new frequency to be found. */
 		return true;
 	}
 
@@ -95,10 +109,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
 				   unsigned int next_freq)
 {
-	if (sg_policy->need_freq_update)
+	if (sg_policy->need_freq_update) {
 		sg_policy->need_freq_update = false;
-	else if (sg_policy->next_freq == next_freq)
+		/*
+		 * The policy limits have changed, but if the return value of
+		 * cpufreq_driver_resolve_freq() after applying the new limits
+		 * is still equal to the previously selected frequency, the
+		 * driver callback need not be invoked unless the driver
+		 * specifically wants that to happen on every update of the
+		 * policy limits.
+		 */
+		if (sg_policy->next_freq == next_freq &&
+		    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
+			return false;
+	} else if (sg_policy->next_freq == next_freq) {
 		return false;
+	}
 
 	sg_policy->next_freq = next_freq;
 	sg_policy->last_freq_update_time = time;
@@ -365,7 +391,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
 static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
 {
 	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
-		sg_cpu->sg_policy->limits_changed = true;
+		sg_cpu->sg_policy->need_freq_update = true;
 }
 
 static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
@@ -871,7 +897,16 @@ static void sugov_limits(struct cpufreq_policy *policy)
 		mutex_unlock(&sg_policy->work_lock);
 	}
 
-	sg_policy->limits_changed = true;
+	/*
+	 * The limits_changed update below must take place before the updates
+	 * of policy limits in cpufreq_set_policy() or a policy limits update
+	 * might be missed, so use a memory barrier to ensure it.
+	 *
+	 * This pairs with the memory barrier in sugov_should_update_freq().
+	 */
+	smp_wmb();
+
+	WRITE_ONCE(sg_policy->limits_changed, true);
 }
 
 struct cpufreq_governor schedutil_gov = {
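The schedutil change depends on barrier pairing between sugov_limits() and sugov_should_update_freq(). As a reminder of the discipline, here is the classical publish/consume pairing with hypothetical variables; it illustrates the pairing rule only, not schedutil's exact ordering, which inverts flag and data:

#include <asm/barrier.h>
#include <linux/compiler.h>

static int shared_data;
static int flag;

static void writer(int v)
{
	WRITE_ONCE(shared_data, v);	/* update the data... */
	smp_wmb();			/* ...then publish the flag */
	WRITE_ONCE(flag, 1);
}

static int reader(void)
{
	if (!READ_ONCE(flag))
		return -1;
	smp_rmb();			/* order the flag read... */
	return READ_ONCE(shared_data);	/* ...before the data read */
}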
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 66bcd40a28ca1..f5133249fd4d9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -163,7 +163,7 @@ enum scx_ops_flags {
 	/*
 	 * CPU cgroup support flags
 	 */
-	SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
+	SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
 
 	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
 				  SCX_OPS_ENQ_LAST |
@@ -1118,8 +1118,38 @@ static void scx_kf_disallow(u32 mask)
 	current->scx.kf_mask &= ~mask;
 }
 
-#define SCX_CALL_OP(mask, op, args...)	\
+/*
+ * Track the rq currently locked.
+ *
+ * This allows kfuncs to safely operate on rq from any scx ops callback,
+ * knowing which rq is already locked.
+ */
+static DEFINE_PER_CPU(struct rq *, locked_rq);
+
+static inline void update_locked_rq(struct rq *rq)
+{
+	/*
+	 * Check whether @rq is actually locked. This can help expose bugs
+	 * or incorrect assumptions about the context in which a kfunc or
+	 * callback is executed.
+	 */
+	if (rq)
+		lockdep_assert_rq_held(rq);
+	__this_cpu_write(locked_rq, rq);
+}
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+	return __this_cpu_read(locked_rq);
+}
+
+#define SCX_CALL_OP(mask, op, rq, args...)	\
 do {	\
+	update_locked_rq(rq);	\
 	if (mask) {	\
 		scx_kf_allow(mask);	\
 		scx_ops.op(args);	\
@@ -1127,11 +1157,14 @@ do {	\
 	} else {	\
 		scx_ops.op(args);	\
 	}	\
+	update_locked_rq(NULL);	\
 } while (0)
 
-#define SCX_CALL_OP_RET(mask, op, args...)	\
+#define SCX_CALL_OP_RET(mask, op, rq, args...)	\
 ({	\
 	__typeof__(scx_ops.op(args)) __ret;	\
+	\
+	update_locked_rq(rq);	\
 	if (mask) {	\
 		scx_kf_allow(mask);	\
 		__ret = scx_ops.op(args);	\
@@ -1139,6 +1172,7 @@ do {	\
 	} else {	\
 		__ret = scx_ops.op(args);	\
 	}	\
+	update_locked_rq(NULL);	\
 	__ret;	\
 })
 
@@ -1153,31 +1187,31 @@ do {	\
  * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
  * the specific task.
  */
-#define SCX_CALL_OP_TASK(mask, op, task, args...)	\
+#define SCX_CALL_OP_TASK(mask, op, rq, task, args...)	\
 do {	\
 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);	\
 	current->scx.kf_tasks[0] = task;	\
-	SCX_CALL_OP(mask, op, task, ##args);	\
+	SCX_CALL_OP(mask, op, rq, task, ##args);	\
 	current->scx.kf_tasks[0] = NULL;	\
 } while (0)
 
-#define SCX_CALL_OP_TASK_RET(mask, op, task, args...)	\
+#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...)	\
 ({	\
 	__typeof__(scx_ops.op(task, ##args)) __ret;	\
 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);	\
 	current->scx.kf_tasks[0] = task;	\
-	__ret = SCX_CALL_OP_RET(mask, op, task, ##args);	\
+	__ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args);	\
 	current->scx.kf_tasks[0] = NULL;	\
 	__ret;	\
 })
 
-#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...)	\
+#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...)	\
 ({	\
 	__typeof__(scx_ops.op(task0, task1, ##args)) __ret;	\
 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);	\
 	current->scx.kf_tasks[0] = task0;	\
 	current->scx.kf_tasks[1] = task1;	\
-	__ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args);	\
+	__ret = SCX_CALL_OP_RET(mask, op, rq, task0, task1, ##args);	\
 	current->scx.kf_tasks[0] = NULL;	\
 	current->scx.kf_tasks[1] = NULL;	\
 	__ret;	\
 })
@@ -2172,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	WARN_ON_ONCE(*ddsp_taskp);
 	*ddsp_taskp = p;
 
-	SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+	SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
 
 	*ddsp_taskp = NULL;
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2269,7 +2303,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	add_nr_running(rq, 1);
 
 	if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
-		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags);
 
 	if (enq_flags & SCX_ENQ_WAKEUP)
 		touch_core_sched(rq, p);
@@ -2283,7 +2317,7 @@ out:
 	__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
 }
 
-static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 {
 	unsigned long opss;
 
@@ -2304,7 +2338,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
 		BUG();
 	case SCX_OPSS_QUEUED:
 		if (SCX_HAS_OP(dequeue))
-			SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+			SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags);
 
 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
 					    SCX_OPSS_NONE))
@@ -2337,7 +2371,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 		return true;
 	}
 
-	ops_dequeue(p, deq_flags);
+	ops_dequeue(rq, p, deq_flags);
 
 	/*
 	 * A currently running task which is going off @rq first gets dequeued
@@ -2353,11 +2387,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 	 */
 	if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
 		update_curr_scx(rq);
-		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false);
 	}
 
 	if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
-		SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+		SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags);
 
 	if (deq_flags & SCX_DEQ_SLEEP)
 		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2377,7 +2411,7 @@ static void yield_task_scx(struct rq *rq)
 	struct task_struct *p = rq->curr;
 
 	if (SCX_HAS_OP(yield))
-		SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+		SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL);
 	else
 		p->scx.slice = 0;
 }
@@ -2387,7 +2421,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
 {
 	struct task_struct *from = rq->curr;
 
 	if (SCX_HAS_OP(yield))
-		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to);
 	else
 		return false;
 }
@@ -2945,7 +2979,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * emitted in switch_class().
 		 */
 		if (SCX_HAS_OP(cpu_acquire))
-			SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+			SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL);
 		rq->scx.cpu_released = false;
 	}
 
@@ -2990,7 +3024,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		do {
 			dspc->nr_tasks = 0;
 
-			SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
+			SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq),
 				    prev_on_scx ? prev : NULL);
 
 			flush_dispatch_buf(rq);
@@ -3104,7 +3138,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 		 * Core-sched might decide to execute @p before it is
 		 * dispatched. Call ops_dequeue() to notify the BPF scheduler.
 		 */
-		ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+		ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
 		dispatch_dequeue(rq, p);
 	}
 
@@ -3112,7 +3146,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
 	if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
-		SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+		SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p);
 
 	clr_task_runnable(p, true);
 
@@ -3193,8 +3227,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
 			.task = next,
 		};
 
-		SCX_CALL_OP(SCX_KF_CPU_RELEASE,
-			    cpu_release, cpu_of(rq), &args);
+		SCX_CALL_OP(SCX_KF_CPU_RELEASE, cpu_release, rq, cpu_of(rq), &args);
 	}
 	rq->scx.cpu_released = true;
 }
@@ -3207,7 +3240,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
 	if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
-		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true);
 
 	if (p->scx.flags & SCX_TASK_QUEUED) {
 		set_task_runnable(rq, p);
@@ -3348,7 +3381,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 	 * verifier.
 	 */
 	if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
-		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL,
					      (struct task_struct *)a,
					      (struct task_struct *)b);
 	else
@@ -3385,7 +3418,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		*ddsp_taskp = p;
 
 		cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
-					   select_cpu, p, prev_cpu, wake_flags);
+					   select_cpu, NULL, p, prev_cpu, wake_flags);
 		p->scx.selected_cpu = cpu;
 		*ddsp_taskp = NULL;
 		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
@@ -3430,8 +3463,8 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 	 * designation pointless. Cast it away when calling the operation.
 	 */
 	if (SCX_HAS_OP(set_cpumask))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
-				 (struct cpumask *)p->cpus_ptr);
+		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL,
+				 p, (struct cpumask *)p->cpus_ptr);
 }
 
@@ -3444,9 +3477,9 @@ static void handle_hotplug(struct rq *rq, bool online)
 	scx_idle_update_selcpu_topology(&scx_ops);
 
 	if (online && SCX_HAS_OP(cpu_online))
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
 	else if (!online && SCX_HAS_OP(cpu_offline))
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
 	else
 		scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
 			     "cpu %d going %s, exiting scheduler", cpu,
@@ -3550,7 +3583,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(tick)) {
-		SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
+		SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr);
 	}
 
 	if (!curr->scx.slice)
@@ -3627,7 +3660,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
 			.fork = fork,
 		};
 
-		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, NULL, p, &args);
 		if (unlikely(ret)) {
 			ret = ops_sanitize_err("init_task", ret);
 			return ret;
@@ -3668,9 +3701,10 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
 
 static void scx_ops_enable_task(struct task_struct *p)
 {
+	struct rq *rq = task_rq(p);
 	u32 weight;
 
-	lockdep_assert_rq_held(task_rq(p));
+	lockdep_assert_rq_held(rq);
 
 	/*
 	 * Set the weight before calling ops.enable() so that the scheduler
@@ -3684,20 +3718,22 @@ static void scx_ops_enable_task(struct task_struct *p)
 	p->scx.weight = sched_weight_to_cgroup(weight);
 
 	if (SCX_HAS_OP(enable))
-		SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+		SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p);
 	scx_set_task_state(p, SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(set_weight))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
 }
 
 static void scx_ops_disable_task(struct task_struct *p)
 {
-	lockdep_assert_rq_held(task_rq(p));
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_rq_held(rq);
 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(disable))
-		SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
+		SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p);
 	scx_set_task_state(p, SCX_TASK_READY);
 }
 
@@ -3726,7 +3762,7 @@ static void scx_ops_exit_task(struct task_struct *p)
 	}
 
 	if (SCX_HAS_OP(exit_task))
-		SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
+		SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
 
@@ -3835,7 +3871,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(set_weight))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
 }
 
 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3851,8 +3887,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 	 * different scheduler class. Keep the BPF scheduler up-to-date.
 	 */
 	if (SCX_HAS_OP(set_cpumask))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
-				 (struct cpumask *)p->cpus_ptr);
+		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq,
+				 p, (struct cpumask *)p->cpus_ptr);
 }
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -3899,35 +3935,6 @@ bool scx_can_stop_tick(struct rq *rq)
 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
 static bool scx_cgroup_enabled;
 
-static bool cgroup_warned_missing_weight;
-static bool cgroup_warned_missing_idle;
-
-static void scx_cgroup_warn_missing_weight(struct task_group *tg)
-{
-	if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
-	    cgroup_warned_missing_weight)
-		return;
-
-	if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
-		return;
-
-	pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
-		scx_ops.name);
-	cgroup_warned_missing_weight = true;
-}
-
-static void scx_cgroup_warn_missing_idle(struct task_group *tg)
-{
-	if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
-		return;
-
-	if (!tg->idle)
-		return;
-
-	pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
-		scx_ops.name);
-	cgroup_warned_missing_idle = true;
-}
 
 int scx_tg_online(struct task_group *tg)
 {
@@ -3937,14 +3944,12 @@ int scx_tg_online(struct task_group *tg)
 
 	percpu_down_read(&scx_cgroup_rwsem);
 
-	scx_cgroup_warn_missing_weight(tg);
-
 	if (scx_cgroup_enabled) {
 		if (SCX_HAS_OP(cgroup_init)) {
 			struct scx_cgroup_init_args args =
 				{ .weight = tg->scx_weight };
 
-			ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+			ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
 					      tg->css.cgroup, &args);
 			if (ret)
 				ret = ops_sanitize_err("cgroup_init", ret);
@@ -3966,7 +3971,7 @@ void scx_tg_offline(struct task_group *tg)
 	percpu_down_read(&scx_cgroup_rwsem);
 
 	if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
 	tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
 
 	percpu_up_read(&scx_cgroup_rwsem);
@@ -3999,7 +4004,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 			continue;
 
 		if (SCX_HAS_OP(cgroup_prep_move)) {
-			ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+			ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL,
 					      p, from, css->cgroup);
 			if (ret)
 				goto err;
@@ -4013,8 +4018,8 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 err:
 	cgroup_taskset_for_each(p, css, tset) {
 		if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
-			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
-				    p->scx.cgrp_moving_from, css->cgroup);
+			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+				    p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
 	}
 
@@ -4032,8 +4037,8 @@ void scx_cgroup_move_task(struct task_struct *p)
 	 * cgrp_moving_from set.
 	 */
 	if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
-		SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
-				 p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+		SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL,
+				 p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
 	p->scx.cgrp_moving_from = NULL;
 }
 
@@ -4052,8 +4057,8 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
 
 	cgroup_taskset_for_each(p, css, tset) {
 		if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
-			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
-				    p->scx.cgrp_moving_from, css->cgroup);
+			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+				    p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
 	}
 out_unlock:
@@ -4066,7 +4071,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 
 	if (scx_cgroup_enabled && tg->scx_weight != weight) {
 		if (SCX_HAS_OP(cgroup_set_weight))
-			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
 				    tg_cgrp(tg), weight);
 		tg->scx_weight = weight;
 	}
@@ -4076,9 +4081,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 
 void scx_group_set_idle(struct task_group *tg, bool idle)
 {
-	percpu_down_read(&scx_cgroup_rwsem);
-	scx_cgroup_warn_missing_idle(tg);
-	percpu_up_read(&scx_cgroup_rwsem);
+	/* TODO: Implement ops->cgroup_set_idle() */
 }
 
 static void scx_cgroup_lock(void)
@@ -4257,7 +4260,7 @@ static void scx_cgroup_exit(void)
 			continue;
 		rcu_read_unlock();
 
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup);
 
 		rcu_read_lock();
 		css_put(css);
@@ -4272,9 +4275,6 @@ static int scx_cgroup_init(void)
 
 	percpu_rwsem_assert_held(&scx_cgroup_rwsem);
 
-	cgroup_warned_missing_weight = false;
-	cgroup_warned_missing_idle = false;
-
 	/*
 	 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
 	 * cgroups and init, all online cgroups are initialized.
@@ -4284,9 +4284,6 @@ static int scx_cgroup_init(void)
 		struct task_group *tg = css_tg(css);
 		struct scx_cgroup_init_args args =
 			{ .weight = tg->scx_weight };
 
-		scx_cgroup_warn_missing_weight(tg);
-		scx_cgroup_warn_missing_idle(tg);
-
 		if ((tg->scx_flags &
 		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
 			continue;
@@ -4300,7 +4297,7 @@ static int scx_cgroup_init(void)
 			continue;
 		rcu_read_unlock();
 
-		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
 				      css->cgroup, &args);
 		if (ret) {
 			css_put(css);
@@ -4623,7 +4620,7 @@ unlock:
 
 static void free_exit_info(struct scx_exit_info *ei)
 {
-	kfree(ei->dump);
+	kvfree(ei->dump);
 	kfree(ei->msg);
 	kfree(ei->bt);
 	kfree(ei);
@@ -4639,7 +4636,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
 
 	ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
 	ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
-	ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+	ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
 
 	if (!ei->bt || !ei->msg || !ei->dump) {
 		free_exit_info(ei);
@@ -4797,7 +4794,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	}
 
 	if (scx_ops.exit)
-		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
 
 	cancel_delayed_work_sync(&scx_watchdog_work);
@@ -5004,7 +5001,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 
 	if (SCX_HAS_OP(dump_task)) {
 		ops_dump_init(s, "    ");
-		SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+		SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p);
 		ops_dump_exit();
 	}
 
@@ -5051,7 +5048,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 
 	if (SCX_HAS_OP(dump)) {
 		ops_dump_init(&s, "");
-		SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx);
 		ops_dump_exit();
 	}
 
@@ -5108,7 +5105,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		used = seq_buf_used(&ns);
 		if (SCX_HAS_OP(dump_cpu)) {
 			ops_dump_init(&ns, "  ");
-			SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+			SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle);
 			ops_dump_exit();
 		}
 
@@ -5252,6 +5249,9 @@ static int validate_ops(const struct sched_ext_ops *ops)
 		return -EINVAL;
 	}
 
+	if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
+		pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
+
 	return 0;
 }
@@ -5364,7 +5364,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_idle_enable(ops);
 
 	if (scx_ops.init) {
-		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
 			cpus_read_unlock();
@@ -6827,6 +6827,12 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
 	BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
 		     __alignof__(struct bpf_iter_scx_dsq));
 
+	/*
+	 * next() and destroy() will be called regardless of the return value.
+	 * Always clear $kit->dsq.
+	 */
+	kit->dsq = NULL;
+
 	if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
 		return -EINVAL;
@@ -7113,13 +7119,32 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
 	}
 
 	if (ops_cpu_valid(cpu, NULL)) {
-		struct rq *rq = cpu_rq(cpu);
+		struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
+		struct rq_flags rf;
+
+		/*
+		 * When called with an rq lock held, restrict the operation
+		 * to the corresponding CPU to prevent ABBA deadlocks.
+ */ + if (locked_rq && rq != locked_rq) { + scx_ops_error("Invalid target CPU %d", cpu); + return; + } + + /* + * If no rq lock is held, allow operating on any CPU by + * acquiring the corresponding rq lock. + */ + if (!locked_rq) { + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + } rq->scx.cpuperf_target = perf; + cpufreq_update_util(rq, 0); - rcu_read_lock_sched_notrace(); - cpufreq_update_util(cpu_rq(cpu), 0); - rcu_read_unlock_sched_notrace(); + if (!locked_rq) + rq_unlock_irqrestore(rq, &rf); } }
@@ -7350,12 +7375,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index cb343ca889e02..e67a19a071c11 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -674,7 +674,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) * managed by put_prev_task_idle()/set_next_task_idle(). */ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); /* * Update the idle masks:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e43993a4e5807..0fb9bf995a479 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7081,9 +7081,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) h_nr_idle = task_has_idle_policy(p); if (task_sleep || task_delayed || !se->sched_delayed) h_nr_runnable = 1; - } else { - cfs_rq = group_cfs_rq(se); - slice = cfs_rq_min_slice(cfs_rq); } for_each_sched_entity(se) {
@@ -7093,6 +7090,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (p && &p->se == se) return -1; + slice = cfs_rq_min_slice(cfs_rq); break; }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f1955..a009c91f7b05f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,10 +164,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which is only updated when the clock has been set or we have + * accumulated time.
+ */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -175,6 +199,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -708,6 +733,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); delta -= incr; } + tk_update_coarse_nsecs(tk); } /**
@@ -804,8 +830,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended);
@@ -813,7 +839,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2161,7 +2187,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; - u64 offset; + u64 offset, orig_offset; guard(raw_spinlock_irqsave)(&tk_core.lock);
@@ -2172,7 +2198,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); - + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false;
@@ -2205,6 +2231,14 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); + /* + * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time. + */ + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set;
@@ -2248,7 +2282,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2305,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2350,12 +2384,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); }
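To see why the timekeeping hunks above introduce a dedicated coarse_nsec snapshot instead of deriving coarse nanoseconds from xtime_nsec on every read, consider this toy model. It is ordinary userspace C, not kernel code; struct tk_model is a made-up stand-in for struct timekeeper.

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

struct tk_model {
	uint64_t xtime_nsec;	/* nanoseconds << shift, like tkr_mono.xtime_nsec */
	uint32_t shift;
	uint64_t coarse_nsec;	/* snapshot taken only when time is accumulated */
};

static void tk_update_coarse(struct tk_model *tk)
{
	tk->coarse_nsec = tk->xtime_nsec >> tk->shift;
}

int main(void)
{
	struct tk_model tk = { .xtime_nsec = 1000ull << 8, .shift = 8 };

	tk_update_coarse(&tk);	/* accumulation point: coarse = 1000 */

	/* A TK_ADV_FREQ adjustment nudges the shifted value down slightly. */
	tk.xtime_nsec -= 3;

	/* Old behaviour: derive from xtime_nsec, yielding a backward step. */
	printf("derived: %" PRIu64 "\n", tk.xtime_nsec >> tk.shift);

	/* New behaviour: the snapshot is untouched until the next
	 * accumulation, so coarse readers never see time move backwards. */
	printf("coarse:  %" PRIu64 "\n", tk.coarse_nsec);
	return 0;
}

With shift = 8, the adjustment drops the derived value from 1000 ns to 999 ns, the backward step the patch prevents, while the snapshot stays at 1000 until the next accumulation updates it.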
EXPORT_SYMBOL(ktime_get_coarse_ts64); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 01c2ab1e89719..32ef27c71b57a 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -98,12 +98,12 @@ void update_vsyscall(struct timekeeper *tk) /* CLOCK_REALTIME_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3679a6d189346..3f6a7bdc6edf6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -893,11 +893,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) -{ - blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); -} - static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { @@ -1089,8 +1084,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); - WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); @@ -1125,7 +1118,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); @@ -1462,7 +1454,6 @@ static const struct { [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; @@ -1896,6 +1887,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; + if (opf & REQ_ATOMIC) + rwbs[i++] = 'U'; rwbs[i] = '\0'; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 95c6e3473a76b..ba7ff14f5339b 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -454,7 +454,8 @@ static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head * struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist) { + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { if (!within_module(node->addr, mod)) continue; if (delete_fprobe_node(node)) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a8a02868b4358..6981830c31285 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1297,6 
+1297,8 @@ void ftrace_free_filter(struct ftrace_ops *ops) return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; } EXPORT_SYMBOL_GPL(ftrace_free_filter); @@ -3434,7 +3436,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** /* Copy the subops hash */ *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash); - if (!filter_hash) + if (!*filter_hash) return -ENOMEM; /* Remove any notrace functions from the copy */ remove_hash(*filter_hash, subops_hash->notrace_hash); @@ -3443,6 +3445,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** size_bits); if (ret < 0) { free_ftrace_hash(*filter_hash); + *filter_hash = EMPTY_HASH; return ret; } } @@ -3472,6 +3475,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** subops_hash->notrace_hash); if (ret < 0) { free_ftrace_hash(*notrace_hash); + *notrace_hash = EMPTY_HASH; return ret; } } @@ -3490,8 +3494,8 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** */ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; int ret; @@ -3605,6 +3609,9 @@ static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash * } } + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + temp_hash.filter_hash = *filter_hash; temp_hash.notrace_hash = *notrace_hash; } @@ -3625,8 +3632,8 @@ static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash * */ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; int ret; if (unlikely(ftrace_disabled)) @@ -3699,8 +3706,11 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, } ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); - if (!ret) + if (!ret) { ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + } if (ret) { /* Put back the original hash */ @@ -5954,9 +5964,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0f877d39a241..3f9bf562beea2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,10 +1887,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) head_page = cpu_buffer->head_page; - /* If both the head and commit are on the reader_page then we are done. 
*/ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8ddf6b17215ca..5b8db27fb6ef3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6043,8 +6043,10 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) tscratch = tr->scratch; /* if there is no tscratch, module_delta must be NULL. */ module_delta = READ_ONCE(tr->module_delta); - if (!module_delta || tscratch->entries[0].mod_addr > addr) + if (!module_delta || !tscratch->nr_entries || + tscratch->entries[0].mod_addr > addr) { return addr + tr->text_delta; + } /* Note that entries must be sorted. */ nr_entries = tscratch->nr_entries;
@@ -6821,13 +6823,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); }
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index a322e4f249a50..5d64a18cacacc 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
@@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because it protects trace_probe_log.
+ */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3e..beee3f8d75444 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee40d4e6ad1cc..4ef4df6623a8d 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned long, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index c08355c3ef32b..916555f0de811 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -969,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } + trace_probe_log_clear(); return ret; + parse_error: ret = -EINVAL; error: + trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0993dfc1c5c16..2048560264bb4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? 
*/ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b66b6d235d913..6e87ae2a1a66b 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1560,7 +1560,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 98ccf3f00c519..4e37a0f6aaa38 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -633,11 +633,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2703b96d89906..3e5c47b6d7b29 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index fee40ffbd4906..b9ab06c995432 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1042,11 +1042,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, struct trace_event_call *call; struct list_head *head; + lockdep_assert_held_read(&trace_event_sem); + /* ftrace defined events have separate call structures */ if (event->type <= __TRACE_LAST_TYPE) { bool found = false; - down_read(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { if (call->event.type == event->type) { found = true; @@ -1056,7 +1057,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, if (call->event.type > __TRACE_LAST_TYPE) break; } - up_read(&trace_event_sem); if (!found) { trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2eeecb6c95eea..424751cdf31f9 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,9 +154,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int 
err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3386439ec9f67..35cf76c75dd76 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -741,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; }
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2ef2e1b800916..2f844c279a3e0 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop); * @arg: data to be passed to fn and handled_kill * @name: the thread's name * - * This returns a specialized task for use by the vhost layer or NULL on + * This returns a specialized task for use by the vhost layer or ERR_PTR() on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */
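The trace_kprobe.c, trace_uprobe.c, and trace_probe.c changes above all follow one pattern: route every creator through dyn_event_create(), so a single mutex both serializes creation and protects the shared trace_probe_log that the parsers write errors into. Below is a hedged pthread sketch of that pattern; the names are illustrative only and this is not the kernel API.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t ops_mutex = PTHREAD_MUTEX_INITIALIZER;
static char parse_log[128];	/* plays the role of trace_probe_log */

static int parse_command(const char *cmd)
{
	/* Writers of parse_log must hold ops_mutex, the invariant the new
	 * lockdep_assert_held() annotations enforce in the kernel. */
	snprintf(parse_log, sizeof(parse_log), "parsing: %s", cmd);
	return strncmp(cmd, "p:", 2) == 0 ? 0 : -1;
}

/* Counterpart of dyn_event_create(): take the mutex around the whole
 * create path, then clear the log (cf. trace_probe_log_clear()). */
static int event_create(const char *cmd)
{
	int ret;

	pthread_mutex_lock(&ops_mutex);
	ret = parse_command(cmd);
	memset(parse_log, 0, sizeof(parse_log));
	pthread_mutex_unlock(&ops_mutex);
	return ret;
}

int main(void)
{
	printf("create: %d\n", event_create("p:my_probe do_sys_open"));
	printf("create: %d\n", event_create("bogus"));
	return 0;
}

Because the error log is cleared before the mutex is dropped, a concurrent creator can never observe, or report, another command's stale parse errors, which is exactly what the trace_eprobe.c trace_probe_log_clear() additions guarantee.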